re-commit r361928: [PowerPC] [Clang] Port SSE intrinsics to PowerPC

author Zi Xuan Wu <wuzish@cn.ibm.com>

Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)

committer Zi Xuan Wu <wuzish@cn.ibm.com>

Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)
author Zi Xuan Wu <wuzish@cn.ibm.com>
Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)
committer Zi Xuan Wu <wuzish@cn.ibm.com>
Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)
diff --git a/lib/Headers/CMakeLists.txt b/lib/Headers/CMakeLists.txt

index 392ca2ae391c911be70758049fb7e2f26ae90320..f7a3e5410ced5ee4404de71e1ff88c23687fd353 100644 (file)
--- a/lib/Headers/CMakeLists.txt
+++ b/lib/Headers/CMakeLists.txt
@@ -126,6 +126,8 @@ set(cuda_wrapper_files
  
  set(ppc_wrapper_files
    ppc_wrappers/mmintrin.h
+  ppc_wrappers/xmmintrin.h
+  ppc_wrappers/mm_malloc.h
  )
  
  set(openmp_wrapper_files
diff --git a/lib/Headers/ppc_wrappers/mm_malloc.h b/lib/Headers/ppc_wrappers/mm_malloc.h

new file mode 100644 (file)

index 0000000..3658919
--- /dev/null
+++ b/lib/Headers/ppc_wrappers/mm_malloc.h
@@ -0,0 +1,48 @@
+/*===---- mm_malloc.h - Implementation of _mm_malloc and _mm_free ----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible.  */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t size, size_t alignment)
+{
+  /* PowerPC64 ELF V2 ABI requires quadword alignment.  */
+  size_t vec_align = sizeof (__vector float);
+  /* Linux GLIBC malloc alignment is at least 2 X ptr size.  */
+  size_t malloc_align = (sizeof (void *) + sizeof (void *));
+  void *ptr;
+
+  if (alignment == malloc_align && alignment == vec_align)
+    return malloc (size);
+  if (alignment < vec_align)
+    alignment = vec_align;
+  if (posix_memalign (&ptr, alignment, size) == 0)
+    return ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void * ptr)
+{
+  free (ptr);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
diff --git a/lib/Headers/ppc_wrappers/xmmintrin.h b/lib/Headers/ppc_wrappers/xmmintrin.h

new file mode 100644 (file)

index 0000000..1b322b6
--- /dev/null
+++ b/lib/Headers/ppc_wrappers/xmmintrin.h
@@ -0,0 +1,1838 @@
+/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header file is to help porting code using Intel intrinsics
+   explicitly from x86_64 to powerpc64/powerpc64le.
+
+   Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
+   VMX/VSX ISA is a good match for vector float SIMD operations.
+   However scalar float operations in vector (XMM) registers require
+   the POWER8 VSX ISA (2.07) level. There are differences for data
+   format and placement of float scalars in the vector register, which
+   require extra steps to match SSE scalar float semantics on POWER.
+
+   It should be noted that there's much difference between X86_64's
+   MXSCR and PowerISA's FPSCR/VSCR registers. It's recommended to use
+   portable <fenv.h> instead of access MXSCR directly.
+
+   Most SSE scalar float intrinsic operations can be performed more
+   efficiently as C language float scalar operations or optimized to
+   use vector SIMD operations. We recommend this for new applications. */
+#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* Define four value permute mask */
+#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
+
+#include <altivec.h>
+
+/* Avoid collisions between altivec.h and strict adherence to C++ and
+   C11 standards.  This should eventually be done inside altivec.h itself,
+   but only after testing a full distro build.  */
+#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
+                                (defined(__STDC_VERSION__) &&  \
+                                 __STDC_VERSION__ >= 201112L))
+#undef vector
+#undef pixel
+#undef bool
+#endif
+
+/* We need type definitions from the MMX header file.  */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free ().  */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type.  */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
+                                      __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create an undefined vector.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+  __m128 __Y = __Y;
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return ((__m128)vec_ld(0, (__v4sf*)__P));
+}
+
+/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return (vec_vsx_ld(0, __P));
+}
+
+/* Load four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf   __tmp;
+  __m128 result;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+       0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = vec_ld (0, (__v4sf *) __P);
+  result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
+  return result;
+}
+
+/* Create a vector with all four elements equal to F.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create the vector [Z Y X W].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Store four SPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  vec_st((__v4sf)__A, 0, (__v4sf*)__P);
+}
+
+/* Store four SPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  *(__m128_u *)__P = __A;
+}
+
+/* Store four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_ps (float *__P, __m128 __A)
+{
+  __v4sf   __tmp;
+  static const __vector unsigned char permute_vector =
+    { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
+       0x17, 0x10, 0x11, 0x12, 0x13 };
+
+  __tmp = (__m128) vec_perm (__A, __A, permute_vector);
+
+  _mm_store_ps (__P, __tmp);
+}
+
+/* Store the lower SPFP value across four words.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = vec_splat((__v4sf)__A, 0);
+  _mm_store_ps (__P, __va);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps1 (float *__P, __m128 __A)
+{
+  _mm_store1_ps (__P, __A);
+}
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ss (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
+}
+
+/* Sets the low SPFP value of A from the low value of B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+
+  return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ss (float const *__P)
+{
+  return _mm_set_ss (*__P);
+}
+
+/* Stores the lower SPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ss (float *__P, __m128 __A)
+{
+  *__P = ((__v4sf)__A)[0];
+}
+
+/* Perform the respective operation on the lower SPFP (single-precision
+   floating-point) values of A and B; the upper three SPFP values are
+   passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a + b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] + __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a - b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] - __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a * b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] * __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ss (__m128 __A, __m128 __B)
+{
+#ifdef _ARCH_PWR7
+  __m128 a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+     results. So to insure we don't generate spurious exceptions
+     (from the upper double values) we splat the lower double
+     before we to the operation.  */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = a / b;
+  /* Then we merge the lower float result with the original upper
+     float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+#else
+  __A[0] = __A[0] / __B[0];
+  return (__A);
+#endif
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = vec_sqrt (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+/* Perform the respective operation on the four SPFP values in A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A + (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A - (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A * (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A / (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ps (__m128 __A)
+{
+  return (vec_sqrt ((__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ps (__m128 __A)
+{
+  return (vec_re ((__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ps (__m128 __A)
+{
+  return (vec_rsqrte (__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = _mm_rcp_ps (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ss (__m128 __A)
+{
+  __m128 a, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower double)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper double values) we splat the lower double
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  c = vec_rsqrte (a);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel (__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ss (__m128 __A, __m128 __B)
+{
+  __v4sf a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower float)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper float values) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf)__A, 0);
+  b = vec_splat ((__v4sf)__B, 0);
+  c = vec_min (a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ss (__m128 __A, __m128 __B)
+{
+  __v4sf a, b, c;
+  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
+  /* PowerISA VSX does not allow partial (for just lower float)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper float values) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat (__A, 0);
+  b = vec_splat (__B, 0);
+  c = vec_max (a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return (vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ps (__m128 __A, __m128 __B)
+{
+  __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
+  return vec_sel (__B, __A, m);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ps (__m128 __A, __m128 __B)
+{
+  __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
+  return vec_sel (__B, __A, m);
+}
+
+/* Perform logical bit-wise operations on 128-bit values.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
+//  return __builtin_ia32_andps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
+}
+
+/* Perform a comparison on the four SPFP values of A and B.  For each
+   element, if the comparison is true, place a mask of all ones in the
+   result, otherwise a mask of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ps (__m128  __A, __m128  __B)
+{
+  __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
+  return ((__m128)vec_nor (temp, temp));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ps (__m128 __A, __m128 __B)
+{
+  return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
+}
+
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ps (__m128  __A, __m128  __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  return ((__m128 ) vec_and (c, d));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ps (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  return ((__m128 ) vec_or (c, d));
+}
+
+/* Perform a comparison on the lower SPFP values of A and B.  If the
+   comparison is true, place a mask of all ones in the result, otherwise a
+   mask of zeros.  The upper three SPFP values are passed through from A.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ss (__m128  __A, __m128  __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpeq(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpge(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpeq(a, b);
+  c = vec_nor (c, c);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpge(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmpgt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we to the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmple(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+  __v4sf a, b, c;
+  /* PowerISA VMX does not allow partial (for just element 0)
+   * results. So to insure we don't generate spurious exceptions
+   * (from the upper elements) we splat the lower float
+   * before we do the operation. */
+  a = vec_splat ((__v4sf) __A, 0);
+  b = vec_splat ((__v4sf) __B, 0);
+  c = (__v4sf) vec_cmplt(a, b);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
+  d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
+  c = vec_and (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  __vector unsigned int a, b;
+  __vector unsigned int c, d;
+  static const __vector unsigned int float_exp_mask =
+    { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
+  static const __vector unsigned int mask =
+    { 0xffffffff, 0, 0, 0 };
+
+  a = (__vector unsigned int) vec_abs ((__v4sf)__A);
+  b = (__vector unsigned int) vec_abs ((__v4sf)__B);
+  c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
+  d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
+  c = vec_or (c, d);
+  /* Then we merge the lower float result with the original upper
+   * float elements from __A.  */
+  return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+/* FIXME
+ * The __mm_ucomi??_ss implementations below are exactly the same as
+ * __mm_comi??_ss because GCC for PowerPC only generates unordered
+ * compares (scalar and vector).
+ * Technically __mm_comieq_ss et al should be using the ordered
+ * compare and signal for QNaNs.
+ * The __mm_ucomieq_sd et all should be OK, as is.
+ */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] == __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] < __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] <= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] > __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] >= __B[0]);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return (__A[0] != __B[0]);
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return ((__v4sf)__A)[0];
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctiw  %2,%2;\n"
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_rint(__A[0]);
+#endif
+  return (res);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the
+   current rounding mode.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  __m64 res = 0;
+#ifdef _ARCH_PWR8
+  double dtmp;
+  __asm__(
+#ifdef __LITTLE_ENDIAN__
+      "xxsldwi %x0,%x0,%x0,3;\n"
+#endif
+      "xscvspdp %x2,%x0;\n"
+      "fctid  %2,%2;\n"
+      "mfvsrd  %1,%x2;\n"
+      : "+wa" (__A),
+        "=r" (res),
+        "=f" (dtmp)
+      : );
+#else
+  res = __builtin_llrint(__A[0]);
+#endif
+  return (res);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return _mm_cvtss_si64 ((__v4sf) __A);
+}
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint
+{
+  /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit.  */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor.  The selector I specifies the type of prefetch operation.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  /* Current PowerPC will ignores the hint parameters.  */
+  __builtin_prefetch (__P);
+}
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode.  Return the integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  /* Splat two lower SPFP values to both halves.  */
+  __v4sf temp, rounded;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.  */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  rounded = vec_rint(temp);
+  result = (__vector unsigned long long) vec_cts (rounded, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  /* Extract the lower float element.  */
+  float temp = __A[0];
+  /* truncate to 32-bit integer and return.  */
+  return temp;
+}
+
+/* Truncate the two lower SPFP values to 32-bit integers.  Return the
+   integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  __v4sf temp;
+  __vector unsigned long long result;
+
+  /* Splat two lower SPFP values to both halves.  */
+  temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
+  result = (__vector unsigned long long) vec_cts (temp, 0);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ps2pi (__m128 __A)
+{
+  return _mm_cvttps_pi32 (__A);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_ss (__m128 __A, int __B)
+{
+  float temp = __B;
+  __A[0] = temp;
+
+  return __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_si2ss (__m128 __A, int __B)
+{
+  return _mm_cvtsi32_ss (__A, __B);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+/* Intel intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_ss (__m128 __A, long long __B)
+{
+  float temp = __B;
+  __A[0] = temp;
+
+  return __A;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+  return _mm_cvtsi64_ss (__A, __B);
+}
+
+/* Convert the two 32-bit values in B to SPFP form and insert them
+   as the two lower elements in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_ps (__m128        __A, __m64        __B)
+{
+  __vector signed int vm1;
+  __vector float vf1;
+
+  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
+  vf1 = (__vector float) vec_ctf (vm1, 0);
+
+  return ((__m128) (__vector unsigned long long)
+    { ((__vector unsigned long long)vf1) [0],
+       ((__vector unsigned long long)__A) [1]});
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_pi2ps (__m128 __A, __m64 __B)
+{
+  return _mm_cvtpi32_ps (__A, __B);
+}
+
+/* Convert the four signed 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi16_ps (__m64 __A)
+{
+  __vector signed short vs8;
+  __vector signed int vi4;
+  __vector float vf1;
+
+  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
+  vi4 = vec_vupklsh (vs8);
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the four unsigned 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu16_ps (__m64 __A)
+{
+  const __vector unsigned short zero =
+    { 0, 0, 0, 0, 0, 0, 0, 0 };
+  __vector unsigned short vs8;
+  __vector unsigned int vi4;
+  __vector float vf1;
+
+  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
+  vi4 = (__vector unsigned int) vec_mergel
+#ifdef __LITTLE_ENDIAN__
+                                           (vs8, zero);
+#else
+                                           (zero, vs8);
+#endif
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the low four signed 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi8_ps (__m64 __A)
+{
+  __vector signed char vc16;
+  __vector signed short vs8;
+  __vector signed int vi4;
+  __vector float vf1;
+
+  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
+  vs8 = vec_vupkhsb (vc16);
+  vi4 = vec_vupkhsh (vs8);
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_cvtpu8_ps (__m64  __A)
+{
+  const __vector unsigned char zero =
+    { 0, 0, 0, 0, 0, 0, 0, 0 };
+  __vector unsigned char vc16;
+  __vector unsigned short vs8;
+  __vector unsigned int vi4;
+  __vector float vf1;
+
+  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
+#ifdef __LITTLE_ENDIAN__
+  vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
+  vi4 = (__vector unsigned int) vec_mergeh (vs8,
+                                           (__vector unsigned short) zero);
+#else
+  vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
+  vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
+                                            vs8);
+#endif
+  vf1 = (__vector float) vec_ctf (vi4, 0);
+
+  return (__m128) vf1;
+}
+
+/* Convert the four signed 32-bit values in A and B to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
+{
+  __vector signed int vi4;
+  __vector float vf4;
+
+  vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
+  vf4 = (__vector float) vec_ctf (vi4, 0);
+  return (__m128) vf4;
+}
+
+/* Convert the four SPFP values in A to four signed 16-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi16 (__m128 __A)
+{
+  __v4sf rounded;
+  __vector signed int temp;
+  __vector unsigned long long result;
+
+  rounded = vec_rint(__A);
+  temp = vec_cts (rounded, 0);
+  result = (__vector unsigned long long) vec_pack (temp, temp);
+
+  return (__m64) ((__vector long long) result)[0];
+}
+
+/* Convert the four SPFP values in A to four signed 8-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi8 (__m128 __A)
+{
+  __v4sf rounded;
+  __vector signed int tmp_i;
+  static const __vector signed int zero = {0, 0, 0, 0};
+  __vector signed short tmp_s;
+  __vector signed char res_v;
+
+  rounded = vec_rint(__A);
+  tmp_i = vec_cts (rounded, 0);
+  tmp_s = vec_pack (tmp_i, zero);
+  res_v = vec_pack (tmp_s, tmp_s);
+  return (__m64) ((__vector long long) res_v)[0];
+}
+
+/* Selects four specific SPFP values from A and B based on MASK.  */
+extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
+{
+  unsigned long element_selector_10 = __mask & 0x03;
+  unsigned long element_selector_32 = (__mask >> 2) & 0x03;
+  unsigned long element_selector_54 = (__mask >> 4) & 0x03;
+  unsigned long element_selector_76 = (__mask >> 6) & 0x03;
+  static const unsigned int permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+#else
+      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
+#endif
+    };
+  __vector unsigned int t;
+
+  t[0] = permute_selectors[element_selector_10];
+  t[1] = permute_selectors[element_selector_32];
+  t[2] = permute_selectors[element_selector_54] + 0x10101010;
+  t[3] = permute_selectors[element_selector_76] + 0x10101010;
+  return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
+}
+
+/* Selects and interleaves the upper two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
+}
+
+/* Selects and interleaves the lower two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
+}
+
+/* Sets the upper two SPFP values with 64-bits of data loaded from P;
+   the lower two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pi (__m128 __A, __m64 const *__P)
+{
+  __vector unsigned long long __a = (__vector unsigned long long)__A;
+  __vector unsigned long long __p = vec_splats(*__P);
+  __a [1] = __p [1];
+
+  return (__m128)__a;
+}
+
+/* Stores the upper two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pi (__m64 *__P, __m128 __A)
+{
+  __vector unsigned long long __a = (__vector unsigned long long) __A;
+
+  *__P = __a[1];
+}
+
+/* Moves the upper two values of B into the lower two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehl_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_mergel ((__vector unsigned long long)__B,
+                             (__vector unsigned long long)__A);
+}
+
+/* Moves the lower two values of B into the upper two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movelh_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) vec_mergeh ((__vector unsigned long long)__A,
+                             (__vector unsigned long long)__B);
+}
+
+/* Sets the lower two SPFP values with 64-bits of data loaded from P;
+   the upper two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pi (__m128 __A, __m64 const *__P)
+{
+  __vector unsigned long long __a = (__vector unsigned long long)__A;
+  __vector unsigned long long __p = vec_splats(*__P);
+  __a [0] = __p [0];
+
+  return (__m128)__a;
+}
+
+/* Stores the lower two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pi (__m64 *__P, __m128 __A)
+{
+  __vector unsigned long long __a = (__vector unsigned long long) __A;
+
+  *__P = __a[0];
+}
+
+#ifdef _ARCH_PWR8
+/* Intrinsic functions that require PowerISA 2.07 minimum.  */
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_ps (__m128  __A)
+{
+  __vector unsigned long long result;
+  static const __vector unsigned int perm_mask =
+    {
+#ifdef __LITTLE_ENDIAN__
+       0x00204060, 0x80808080, 0x80808080, 0x80808080
+#else
+      0x80808080, 0x80808080, 0x80808080, 0x00204060
+#endif
+    };
+
+  result = ((__vector unsigned long long)
+           vec_vbpermq ((__vector unsigned char) __A,
+                        (__vector unsigned char) perm_mask));
+
+#ifdef __LITTLE_ENDIAN__
+  return result[1];
+#else
+  return result[0];
+#endif
+}
+#endif /* _ARCH_PWR8 */
+
+/* Create a vector with all four elements equal to *P.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_ps (float const *__P)
+{
+  return _mm_set1_ps (*__P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps1 (float const *__P)
+{
+  return _mm_load1_ps (__P);
+}
+
+/* Extracts one of the four words of A.  The selector N must be immediate.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_pi16 (__m64 const __A, int const __N)
+{
+  unsigned int shiftr = __N & 3;
+#ifdef __BIG_ENDIAN__
+  shiftr = 3 - shiftr;
+#endif
+
+  return ((__A >> (shiftr * 16)) & 0xffff);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pextrw (__m64 const __A, int const __N)
+{
+  return _mm_extract_pi16 (__A, __N);
+}
+
+/* Inserts word D into one of four words of A.  The selector N must be
+   immediate.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
+{
+  const int shiftl = (__N & 3) * 16;
+  const __m64 shiftD = (const __m64) __D << shiftl;
+  const __m64 mask = 0xffffUL << shiftl;
+  __m64 result = (__A & (~mask)) | (shiftD & mask);
+
+  return (result);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pinsrw (__m64 const __A, int const __D, int const __N)
+{
+  return _mm_insert_pi16 (__A, __D, __N);
+}
+
+/* Compute the element-wise maximum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+_mm_max_pi16 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector signed short a, b, r;
+  __vector __bool short c;
+
+  a = (__vector signed short)vec_splats (__A);
+  b = (__vector signed short)vec_splats (__B);
+  c = (__vector __bool short)vec_cmpgt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+  res.as_short[0] =
+      (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
+  res.as_short[1] =
+      (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
+  res.as_short[2] =
+      (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
+  res.as_short[3] =
+      (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxsw (__m64 __A, __m64 __B)
+{
+  return _mm_max_pi16 (__A, __B);
+}
+
+/* Compute the element-wise maximum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pu8 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector unsigned char a, b, r;
+  __vector __bool char c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = (__vector __bool char)vec_cmpgt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+  long i;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+
+  for (i = 0; i < 8; i++)
+  res.as_char[i] =
+      ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
+         m1.as_char[i] : m2.as_char[i];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxub (__m64 __A, __m64 __B)
+{
+  return _mm_max_pu8 (__A, __B);
+}
+
+/* Compute the element-wise minimum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pi16 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector signed short a, b, r;
+  __vector __bool short c;
+
+  a = (__vector signed short)vec_splats (__A);
+  b = (__vector signed short)vec_splats (__B);
+  c = (__vector __bool short)vec_cmplt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+  res.as_short[0] =
+      (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
+  res.as_short[1] =
+      (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
+  res.as_short[2] =
+      (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
+  res.as_short[3] =
+      (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminsw (__m64 __A, __m64 __B)
+{
+  return _mm_min_pi16 (__A, __B);
+}
+
+/* Compute the element-wise minimum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pu8 (__m64 __A, __m64 __B)
+{
+#if _ARCH_PWR8
+  __vector unsigned char a, b, r;
+  __vector __bool char c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = (__vector __bool char)vec_cmplt (a, b);
+  r = vec_sel (b, a, c);
+  return (__m64) ((__vector long long) r)[0];
+#else
+  __m64_union m1, m2, res;
+  long i;
+
+  m1.as_m64 = __A;
+  m2.as_m64 = __B;
+
+
+  for (i = 0; i < 8; i++)
+  res.as_char[i] =
+      ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
+         m1.as_char[i] : m2.as_char[i];
+
+  return (__m64) res.as_m64;
+#endif
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminub (__m64 __A, __m64 __B)
+{
+  return _mm_min_pu8 (__A, __B);
+}
+
+/* Create an 8-bit mask of the signs of 8-bit values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+  unsigned long long p =
+#ifdef __LITTLE_ENDIAN__
+                         0x0008101820283038UL; // permute control for sign bits
+#else
+                         0x3830282018100800UL; // permute control for sign bits
+#endif
+  return __builtin_bpermd (p, __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmovmskb (__m64 __A)
+{
+  return _mm_movemask_pi8 (__A);
+}
+
+/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
+   in B and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b;
+  __vector unsigned short c;
+  __vector unsigned int w0, w1;
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
+      0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
+#else
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
+      0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
+#endif
+    };
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+
+  w0 = vec_vmuleuh (a, b);
+  w1 = vec_vmulouh (a, b);
+  c = (__vector unsigned short)vec_perm (w0, w1, xform1);
+
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhuw (__m64 __A, __m64 __B)
+{
+  return _mm_mulhi_pu16 (__A, __B);
+}
+
+/* Return a combination of the four 16-bit values in A.  The selector
+   must be an immediate.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+  unsigned long element_selector_10 = __N & 0x03;
+  unsigned long element_selector_32 = (__N >> 2) & 0x03;
+  unsigned long element_selector_54 = (__N >> 4) & 0x03;
+  unsigned long element_selector_76 = (__N >> 6) & 0x03;
+  static const unsigned short permute_selectors[4] =
+    {
+#ifdef __LITTLE_ENDIAN__
+             0x0908, 0x0B0A, 0x0D0C, 0x0F0E
+#else
+             0x0607, 0x0405, 0x0203, 0x0001
+#endif
+    };
+  __m64_union t;
+  __vector unsigned long long a, p, r;
+
+#ifdef __LITTLE_ENDIAN__
+  t.as_short[0] = permute_selectors[element_selector_10];
+  t.as_short[1] = permute_selectors[element_selector_32];
+  t.as_short[2] = permute_selectors[element_selector_54];
+  t.as_short[3] = permute_selectors[element_selector_76];
+#else
+  t.as_short[3] = permute_selectors[element_selector_10];
+  t.as_short[2] = permute_selectors[element_selector_32];
+  t.as_short[1] = permute_selectors[element_selector_54];
+  t.as_short[0] = permute_selectors[element_selector_76];
+#endif
+  p = vec_splats (t.as_m64);
+  a = vec_splats (__A);
+  r = vec_perm (a, a, (__vector unsigned char)p);
+  return (__m64) ((__vector long long) r)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pshufw (__m64 __A, int const __N)
+{
+  return _mm_shuffle_pi16 (__A, __N);
+}
+
+/* Conditionally store byte elements of A into P.  The high bit of each
+   byte in the selector N determines whether the corresponding byte from
+   A is stored.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+  __m64 hibit = 0x8080808080808080UL;
+  __m64 mask, tmp;
+  __m64 *p = (__m64*)__P;
+
+  tmp = *p;
+  mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
+  tmp = (tmp & (~mask)) | (__A & mask);
+  *p = tmp;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  __vector unsigned char a, b, c;
+
+  a = (__vector unsigned char)vec_splats (__A);
+  b = (__vector unsigned char)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  __vector unsigned short a, b, c;
+
+  a = (__vector unsigned short)vec_splats (__A);
+  b = (__vector unsigned short)vec_splats (__B);
+  c = vec_avg (a, b);
+  return (__m64) ((__vector long long) c)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B.  Return the value in the lower 16-bit word; the
+   upper words are cleared.  */
+extern __inline    __m64    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64  __A, __m64  __B)
+{
+  __vector unsigned char a, b;
+  __vector unsigned char vmin, vmax, vabsdiff;
+  __vector signed int vsum;
+  const __vector unsigned int zero =
+    { 0, 0, 0, 0 };
+  __m64_union result = {0};
+
+  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
+  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
+  vmin = vec_min (a, b);
+  vmax = vec_max (a, b);
+  vabsdiff = vec_sub (vmax, vmin);
+  /* Sum four groups of bytes into integers.  */
+  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+  /* Sum across four integers with integer result.  */
+  vsum = vec_sums (vsum, (__vector signed int) zero);
+  /* The sum is in the right most 32-bits of the vector result.
+     Transfer to a GPR and truncate to 16 bits.  */
+  result.as_short[0] = vsum[3];
+  return result.as_m64;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+  );
+  *__P = __A;
+}
+
+/* Likewise.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  /* Use the data cache block touch for store transient.  */
+  __asm__ (
+    "  dcbtstt 0,%0"
+    :
+    : "b" (__P)
+    : "memory"
+  );
+  _mm_store_ps (__P, __A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  /* Generate a light weight sync.  */
+  __atomic_thread_fence (__ATOMIC_RELEASE);
+}
+
+/* The execution of the next instruction is delayed by an implementation
+   specific amount of time.  The instruction does not modify the
+   architectural state.  This is after the pop_options pragma because
+   it does not require SSE support in the processor--the encoding is a
+   nop on processors that do not support it.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  /* There is no exact match with this construct, but the following is
+     close to the desired effect.  */
+#if _ARCH_PWR8
+  /* On power8 and later processors we can depend on Program Priority
+     (PRI) and associated "very low" PPI setting.  Since we don't know
+     what PPI this thread is running at we: 1) save the current PRI
+     from the PPR SPR into a local GRP, 2) set the PRI to "very low*
+     via the special or 31,31,31 encoding. 3) issue an "isync" to
+     insure the PRI change takes effect before we execute any more
+     instructions.
+     Now we can execute a lwsync (release barrier) while we execute
+     this thread at "very low" PRI.  Finally we restore the original
+     PRI and continue execution.  */
+  unsigned long __PPR;
+
+  __asm__ volatile (
+    "  mfppr   %0;"
+    "   or 31,31,31;"
+    "   isync;"
+    "   lwsync;"
+    "   isync;"
+    "   mtppr  %0;"
+    : "=r" (__PPR)
+    :
+    : "memory"
+  );
+#else
+  /* For older processor where we may not even have Program Priority
+     controls we can only depend on Heavy Weight Sync.  */
+  __atomic_thread_fence (__ATOMIC_SEQ_CST);
+#endif
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3].  */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                      \
+do {                                                                   \
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);   \
+  __v4sf __t0 = vec_vmrghw (__r0, __r1);                       \
+  __v4sf __t1 = vec_vmrghw (__r2, __r3);                       \
+  __v4sf __t2 = vec_vmrglw (__r0, __r1);                       \
+  __v4sf __t3 = vec_vmrglw (__r2, __r3);                       \
+  (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0,       \
+                              (__vector long long)__t1);       \
+  (row1) = (__v4sf)vec_mergel ((__vector long long)__t0,       \
+                              (__vector long long)__t1);       \
+  (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2,       \
+                              (__vector long long)__t3);       \
+  (row3) = (__v4sf)vec_mergel ((__vector long long)__t2,       \
+                              (__vector long long)__t3);       \
+} while (0)
+
+/* For backward source compatibility.  */
+//# include <emmintrin.h>
+
+#endif /* _XMMINTRIN_H_INCLUDED */
diff --git a/test/CodeGen/ppc-mm-malloc-le.c b/test/CodeGen/ppc-mm-malloc-le.c

new file mode 100644 (file)

index 0000000..14c1b25
--- /dev/null
+++ b/test/CodeGen/ppc-mm-malloc-le.c
@@ -0,0 +1,72 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+// UNSUPPORTED: !powerpc64le-
+// The stdlib.h included in mm_malloc.h references native system header
+// like: bits/libc-header-start.h or features.h, cross-compile it may
+// require installing target headers in build env, otherwise expecting
+// failures. So this test will focus on native build only.
+
+// RUN: %clang -target powerpc64le-unknown-linux-gnu -S -emit-llvm %s -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s
+
+#include <mm_malloc.h>
+
+
+void __attribute__((noinline))
+test_mm_malloc() {
+  char *buf = _mm_malloc(100, 16);
+  _mm_free(buf);
+}
+
+// CHECK-LABEL: @test_mm_malloc
+
+// CHECK: define internal i8* @_mm_malloc(i64 [[REG1:[0-9a-zA-Z_%.]+]], i64 [[REG2:[0-9a-zA-Z_%.]+]])
+// CHECK: [[REG3:[0-9a-zA-Z_%.]+]] = alloca i8*, align 8
+// CHECK: store i64 [[REG1]], i64* [[REG4:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 [[REG2]], i64* [[REG5:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 16, i64* [[REG6:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 16, i64* [[REG7:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: [[REG8:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG7]], align 8
+// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = icmp eq i64 [[REG8]], [[REG9]]
+// CHECK-NEXT: br i1 [[REG10]], label %[[REG11:[0-9a-zA-Z_%.]+]], label %[[REG12:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG11]]:
+// CHECK-NEXT: [[REG13:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG14:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = icmp eq i64 [[REG13]], [[REG14]]
+// CHECK-NEXT: br i1 [[REG15]], label %[[REG16:[0-9a-zA-Z_%.]+]], label %[[REG12:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG16]]:
+// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG4]], align 8
+// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = call noalias i8* @malloc(i64 [[REG17]])
+// CHECK-NEXT: store i8* [[REG18]], i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG12]]:
+// CHECK-NEXT: [[REG20:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG21:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = icmp ult i64 [[REG20]], [[REG21]]
+// CHECK-NEXT: br i1 [[REG22]], label %[[REG23:[0-9a-zA-Z_%.]+]], label %[[REG24:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG23]]:
+// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: store i64 [[REG25]], i64* [[REG5]], align 8
+// CHECK-NEXT: br label %[[REG12:[0-9a-zA-Z_%.]+]]4
+// CHECK: [[REG24]]:
+// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG4]], align 8
+// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = call signext i32 @posix_memalign(i8** [[REG29:[0-9a-zA-Z_%.]+]], i64 [[REG26]], i64 [[REG27]])
+// CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = icmp eq i32 [[REG28]], 0
+// CHECK-NEXT: br i1 [[REG30]], label %[[REG31:[0-9a-zA-Z_%.]+]], label %[[REG32:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG31]]:
+// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG29]], align 8
+// CHECK-NEXT: store i8* [[REG33]], i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG32]]:
+// CHECK-NEXT: store i8* null, i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG19]]:
+// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG3]], align 8
+// CHECK-NEXT: ret i8* [[REG34]]
+
+// CHECK: define internal void @_mm_free(i8* [[REG35:[0-9a-zA-Z_%.]+]])
+// CHECK: store i8* [[REG35]], i8** [[REG36:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG36]], align 8
+// CHECK-NEXT: call void @free(i8* [[REG37]])
+// CHECK-NEXT: ret void
diff --git a/test/CodeGen/ppc-mm-malloc.c b/test/CodeGen/ppc-mm-malloc.c

new file mode 100644 (file)

index 0000000..b85d8d9
--- /dev/null
+++ b/test/CodeGen/ppc-mm-malloc.c
@@ -0,0 +1,72 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+// UNSUPPORTED: !powerpc64-
+// The stdlib.h included in mm_malloc.h references native system header
+// like: bits/libc-header-start.h or features.h, cross-compile it may
+// require installing target headers in build env, otherwise expecting
+// failures. So this test will focus on native build only.
+
+// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s
+
+#include <mm_malloc.h>
+
+
+void __attribute__((noinline))
+test_mm_malloc() {
+  char *buf = _mm_malloc(100, 16);
+  _mm_free(buf);
+}
+
+// CHECK-LABEL: @test_mm_malloc
+
+// CHECK: define internal i8* @_mm_malloc(i64 [[REG1:[0-9a-zA-Z_%.]+]], i64 [[REG2:[0-9a-zA-Z_%.]+]])
+// CHECK: [[REG3:[0-9a-zA-Z_%.]+]] = alloca i8*, align 8
+// CHECK: store i64 [[REG1]], i64* [[REG4:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 [[REG2]], i64* [[REG5:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 16, i64* [[REG6:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 16, i64* [[REG7:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: [[REG8:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG9:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG7]], align 8
+// CHECK-NEXT: [[REG10:[0-9a-zA-Z_%.]+]] = icmp eq i64 [[REG8]], [[REG9]]
+// CHECK-NEXT: br i1 [[REG10]], label %[[REG11:[0-9a-zA-Z_%.]+]], label %[[REG12:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG11]]:
+// CHECK-NEXT: [[REG13:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG14:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = icmp eq i64 [[REG13]], [[REG14]]
+// CHECK-NEXT: br i1 [[REG15]], label %[[REG16:[0-9a-zA-Z_%.]+]], label %[[REG12:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG16]]:
+// CHECK-NEXT: [[REG17:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG4]], align 8
+// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = call noalias i8* @malloc(i64 [[REG17]])
+// CHECK-NEXT: store i8* [[REG18]], i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG12]]:
+// CHECK-NEXT: [[REG20:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG21:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = icmp ult i64 [[REG20]], [[REG21]]
+// CHECK-NEXT: br i1 [[REG22]], label %[[REG23:[0-9a-zA-Z_%.]+]], label %[[REG24:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG23]]:
+// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG6]], align 8
+// CHECK-NEXT: store i64 [[REG25]], i64* [[REG5]], align 8
+// CHECK-NEXT: br label %[[REG12:[0-9a-zA-Z_%.]+]]4
+// CHECK: [[REG24]]:
+// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG5]], align 8
+// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG4]], align 8
+// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = call signext i32 @posix_memalign(i8** [[REG29:[0-9a-zA-Z_%.]+]], i64 [[REG26]], i64 [[REG27]])
+// CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = icmp eq i32 [[REG28]], 0
+// CHECK-NEXT: br i1 [[REG30]], label %[[REG31:[0-9a-zA-Z_%.]+]], label %[[REG32:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG31]]:
+// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG29]], align 8
+// CHECK-NEXT: store i8* [[REG33]], i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG32]]:
+// CHECK-NEXT: store i8* null, i8** [[REG3]], align 8
+// CHECK-NEXT: br label %[[REG19:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG19]]:
+// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG3]], align 8
+// CHECK-NEXT: ret i8* [[REG34]]
+
+// CHECK: define internal void @_mm_free(i8* [[REG35:[0-9a-zA-Z_%.]+]])
+// CHECK: store i8* [[REG35]], i8** [[REG36:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = load i8*, i8** [[REG36]], align 8
+// CHECK-NEXT: call void @free(i8* [[REG37]])
+// CHECK-NEXT: ret void
diff --git a/test/CodeGen/ppc-mmintrin.c b/test/CodeGen/ppc-mmintrin.c

index 212a387ec35b86d7b555c89cb9672c79260e3d9e..019672863331d87dda8b7063f0ff01d7417f107c 100644 (file)
--- a/test/CodeGen/ppc-mmintrin.c
+++ b/test/CodeGen/ppc-mmintrin.c
@@ -1,12 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
  // REQUIRES: powerpc-registered-target
  
-// RUN: %clang -S -emit-llvm -target powerpc64-gnu-linux -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
  // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-BE
-// RUN: %clang -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \
  // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-LE
-// RUN: %clang -S -emit-llvm -target powerpc64-gnu-linux -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
  // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-BE
-// RUN: %clang -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \
  // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-LE
  
  #include <mmintrin.h>
diff --git a/test/CodeGen/ppc-xmmintrin.c b/test/CodeGen/ppc-xmmintrin.c

new file mode 100644 (file)

index 0000000..fd81937
--- /dev/null
+++ b/test/CodeGen/ppc-xmmintrin.c
@@ -0,0 +1,2090 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+
+#include <xmmintrin.h>
+
+__m128 res, m1, m2;
+__m64 res64, ms[2];
+float fs[4];
+int i, i2;
+long long i64;
+
+// CHECK-LE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 2312, i16 2826, i16 3340, i16 3854], align 2
+// CHECK-BE-DAG: @_mm_shuffle_pi16.permute_selectors = internal constant [4 x i16] [i16 1543, i16 1029, i16 515, i16 1], align 2
+
+// CHECK-LE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 50462976, i32 117835012, i32 185207048, i32 252579084], align 4
+// CHECK-BE-DAG: @_mm_shuffle_ps.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
+
+void __attribute__((noinline))
+test_add() {
+  res = _mm_add_ps(m1, m2);
+  res = _mm_add_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_add
+
+// CHECK: define available_externally <4 x float> @_mm_add_ps(<4 x float> [[REG1:[0-9a-zA-Z_%.]+]], <4 x float> [[REG2:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG1]], <4 x float>* [[REG3:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG2]], <4 x float>* [[REG4:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG5:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG3]], align 16
+// CHECK-NEXT: [[REG6:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG4]], align 16
+// CHECK-NEXT: [[REG7:[0-9a-zA-Z_%.]+]] = fadd <4 x float> [[REG5]], [[REG6]]
+// CHECK-NEXT: ret <4 x float> [[REG7]]
+
+// CHECK: define available_externally <4 x float> @_mm_add_ss(<4 x float> [[REG8:[0-9a-zA-Z_%.]+]], <4 x float> [[REG9:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG8]], <4 x float>* [[REG10:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG9]], <4 x float>* [[REG11:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG12:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG10]], align 16
+// CHECK-NEXT: [[REG13:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG12]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG13]], <4 x float>* [[REG14:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG15:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG11]], align 16
+// CHECK-NEXT: [[REG16:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG15]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG16]], <4 x float>* [[REG17:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG18:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG14]], align 16
+// CHECK-NEXT: [[REG19:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG17]], align 16
+// CHECK-NEXT: [[REG20:[0-9a-zA-Z_%.]+]] = fadd <4 x float> [[REG18]], [[REG19]]
+// CHECK-NEXT: store <4 x float> [[REG20]], <4 x float>* [[REG21:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG22:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG10]], align 16
+// CHECK-NEXT: [[REG23:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG21]], align 16
+// CHECK-NEXT: [[REG24:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG22]], <4 x float> [[REG23]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG24]]
+
+void __attribute__((noinline))
+test_avg() {
+  res64 = _mm_avg_pu16(ms[0], ms[1]);
+  res64 = _mm_avg_pu8(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_avg
+
+// CHECK: define available_externally i64 @_mm_avg_pu16
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG25:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG26:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG25]])
+// CHECK-NEXT: [[REG27:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG26]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG27]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG28:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG29:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG28]])
+// CHECK-NEXT: [[REG30:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG29]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG30]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG31:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG32:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG33:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_avg(unsigned short vector[8], unsigned short vector[8])(<8 x i16> [[REG31]], <8 x i16> [[REG32]])
+// CHECK-NEXT: store <8 x i16> [[REG33]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG34:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG35:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG34]] to <2 x i64>
+// CHECK-NEXT: [[REG36:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG35]], i32 0
+// CHECK-NEXT: ret i64 [[REG36]]
+
+// CHECK: define available_externally i64 @_mm_avg_pu8
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG37:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG38:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG37]])
+// CHECK-NEXT: [[REG39:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG38]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG39]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG40:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG41:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG40]])
+// CHECK-NEXT: [[REG42:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG41]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG42]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG43:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG44:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG45:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_avg(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG43]], <16 x i8> [[REG44]])
+// CHECK-NEXT: store <16 x i8> [[REG45]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG46:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG47:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG46]] to <2 x i64>
+// CHECK-NEXT: [[REG48:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG47]], i32 0
+// CHECK-NEXT: ret i64 [[REG48]]
+
+void __attribute__((noinline))
+test_alt_name_avg() {
+  res64 = _m_pavgw(ms[0], ms[1]);
+  res64 = _m_pavgb(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_alt_name_avg
+
+// CHECK: define available_externally i64 @_m_pavgw
+// CHECK: [[REG49:[0-9a-zA-Z_%.]+]] = call i64 @_mm_avg_pu16
+// CHECK-NEXT: ret i64 [[REG49]]
+
+// CHECK: define available_externally i64 @_m_pavgb
+// CHECK: [[REG50:[0-9a-zA-Z_%.]+]] = call i64 @_mm_avg_pu8
+// CHECK-NEXT: ret i64 [[REG50]]
+
+void __attribute__((noinline))
+test_cmp() {
+  res = _mm_cmpeq_ps(m1, m2);
+  res = _mm_cmpeq_ss(m1, m2);
+  res = _mm_cmpge_ps(m1, m2);
+  res = _mm_cmpge_ss(m1, m2);
+  res = _mm_cmpgt_ps(m1, m2);
+  res = _mm_cmpgt_ss(m1, m2);
+  res = _mm_cmple_ps(m1, m2);
+  res = _mm_cmple_ss(m1, m2);
+  res = _mm_cmplt_ps(m1, m2);
+  res = _mm_cmplt_ss(m1, m2);
+  res = _mm_cmpneq_ps(m1, m2);
+  res = _mm_cmpneq_ss(m1, m2);
+  res = _mm_cmpnge_ps(m1, m2);
+  res = _mm_cmpnge_ss(m1, m2);
+  res = _mm_cmpngt_ps(m1, m2);
+  res = _mm_cmpngt_ss(m1, m2);
+  res = _mm_cmpnle_ps(m1, m2);
+  res = _mm_cmpnle_ss(m1, m2);
+  res = _mm_cmpnlt_ps(m1, m2);
+  res = _mm_cmpnlt_ss(m1, m2);
+  res = _mm_cmpord_ps(m1, m2);
+  res = _mm_cmpord_ss(m1, m2);
+  res = _mm_cmpunord_ps(m1, m2);
+  res = _mm_cmpunord_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_cmp
+
+// CHECK: define available_externally <4 x float> @_mm_cmpeq_ps
+// CHECK: [[REG51:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpeq(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG52:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG51]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG52]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpeq_ss
+// CHECK: [[REG53:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG53]], <4 x float>* [[REG54:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG55:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG55]], <4 x float>* [[REG56:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG57:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG54]], align 16
+// CHECK-NEXT: [[REG58:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG56]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpeq(float vector[4], float vector[4])(<4 x float> [[REG57]], <4 x float> [[REG58]])
+// CHECK: [[REG59:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG59]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpge_ps
+// CHECK: [[REG60:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpge(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG61:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG60]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG61]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpge_ss
+// CHECK: [[REG62:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG62]], <4 x float>* [[REG63:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG64:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG64]], <4 x float>* [[REG65:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG66:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG63]], align 16
+// CHECK-NEXT: [[REG67:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG65]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpge(float vector[4], float vector[4])(<4 x float> [[REG66]], <4 x float> [[REG67]])
+// CHECK: [[REG68:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG68]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpgt_ps
+// CHECK: [[REG69:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG70:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG69]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG70]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpgt_ss
+// CHECK: [[REG71:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG71]], <4 x float>* [[REG72:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG73:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG73]], <4 x float>* [[REG74:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG75:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG72]], align 16
+// CHECK-NEXT: [[REG76:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG74]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])(<4 x float> [[REG75]], <4 x float> [[REG76]])
+// CHECK: [[REG77:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG77]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmple_ps
+// CHECK: [[REG78:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmple(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG79:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG78]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG79]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmple_ss
+// CHECK: [[REG80:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG80]], <4 x float>* [[REG81:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG82:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG82]], <4 x float>* [[REG83:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG84:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG81]], align 16
+// CHECK-NEXT: [[REG85:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG83]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmple(float vector[4], float vector[4])(<4 x float> [[REG84]], <4 x float> [[REG85]])
+// CHECK: [[REG86:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG86]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmplt_ps
+// CHECK: [[REG87:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmplt(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG88:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG87]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG88]]
+
+// CHECK: @_mm_cmplt_ss
+// CHECK: [[REG89:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG89]], <4 x float>* [[REG90:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG91:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG91]], <4 x float>* [[REG92:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG93:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG90]], align 16
+// CHECK-NEXT: [[REG94:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG92]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmplt(float vector[4], float vector[4])(<4 x float> [[REG93]], <4 x float> [[REG94]])
+// CHECK: [[REG95:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG95]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpneq_ps
+// CHECK: [[REG96:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpeq(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG97:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG96]] to <4 x float>
+// CHECK-NEXT: store <4 x float> [[REG97]], <4 x float>* [[REG98:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG99:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG98]], align 16
+// CHECK-NEXT: [[REG100:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG98]], align 16
+// CHECK-NEXT: [[REG101:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_nor(float vector[4], float vector[4])(<4 x float> [[REG99]], <4 x float> [[REG100]])
+// CHECK-NEXT: ret <4 x float> [[REG101]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpneq_ss
+// CHECK: [[REG102:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG102]], <4 x float>* [[REG103:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG104:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG104]], <4 x float>* [[REG105:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG106:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG103]], align 16
+// CHECK-NEXT: [[REG107:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG105]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpeq(float vector[4], float vector[4])(<4 x float> [[REG106]], <4 x float> [[REG107]])
+// CHECK: call <4 x float> @vec_nor(float vector[4], float vector[4])
+// CHECK: [[REG108:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG108]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnge_ps
+// CHECK: [[REG109:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmplt(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG110:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG109]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG110]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnge_ss
+// CHECK: [[REG111:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG111]], <4 x float>* [[REG112:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG113:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG113]], <4 x float>* [[REG114:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG115:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG112]], align 16
+// CHECK-NEXT: [[REG116:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG114]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmplt(float vector[4], float vector[4])(<4 x float> [[REG115]], <4 x float> [[REG116]])
+// CHECK: [[REG117:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG117]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpngt_ps
+// CHECK: [[REG118:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmple(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG119:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG118]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG119]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpngt_ss
+// CHECK: [[REG120:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG120]], <4 x float>* [[REG121:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG122:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG122]], <4 x float>* [[REG123:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG124:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG121]], align 16
+// CHECK-NEXT: [[REG125:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG123]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmple(float vector[4], float vector[4])(<4 x float> [[REG124]], <4 x float> [[REG125]])
+// CHECK: [[REG126:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG126]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnle_ps
+// CHECK: [[REG127:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG128:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG127]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG128]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnle_ss
+// CHECK: [[REG129:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG129]], <4 x float>* [[REG130:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG131:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG131]], <4 x float>* [[REG132:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG133:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG130]], align 16
+// CHECK-NEXT: [[REG134:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG132]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])(<4 x float> [[REG133]], <4 x float> [[REG134]])
+// CHECK: [[REG135:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG135]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnlt_ps
+// CHECK: [[REG136:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpge(float vector[4], float vector[4])
+// CHECK-NEXT: [[REG137:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG136]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG137]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpnlt_ss
+// CHECK: [[REG138:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG138]], <4 x float>* [[REG139:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG140:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG140]], <4 x float>* [[REG141:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG142:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG139]], align 16
+// CHECK-NEXT: [[REG143:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG141]], align 16
+// CHECK-NEXT: call <4 x i32> @vec_cmpge(float vector[4], float vector[4])(<4 x float> [[REG142]], <4 x float> [[REG143]])
+// CHECK: [[REG144:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG144]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpord_ps
+// CHECK: [[REG145:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}})
+// CHECK-NEXT: [[REG146:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG145]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG146]], <4 x i32>* [[REG147:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG148:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}})
+// CHECK-NEXT: [[REG149:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG148]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG149]], <4 x i32>* [[REG150:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG151:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG147]], align 16
+// CHECK-NEXT: [[REG152:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>, <4 x i32> [[REG151]])
+// CHECK-NEXT: store <4 x i32> [[REG152]], <4 x i32>* [[REG153:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG154:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG150]], align 16
+// CHECK-NEXT: [[REG155:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>, <4 x i32> [[REG154]])
+// CHECK-NEXT: store <4 x i32> [[REG155]], <4 x i32>* [[REG156:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG157:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG153]], align 16
+// CHECK-NEXT: [[REG158:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG156]], align 16
+// CHECK-NEXT: [[REG159:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_and(unsigned int vector[4], unsigned int vector[4])(<4 x i32> {{[0-9a-zA-Z_%.]+}}, <4 x i32> {{[0-9a-zA-Z_%.]+}})
+// CHECK-NEXT: [[REG160:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG159]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG160]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpord_ss
+// CHECK: [[REG161:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])
+// CHECK-NEXT: [[REG162:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG161]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG162]], <4 x i32>* [[REG163:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG164:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])
+// CHECK-NEXT: [[REG165:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG164]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG165]], <4 x i32>* [[REG166:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG167:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG163]], align 16
+// CHECK-NEXT: [[REG168:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>, <4 x i32> [[REG167]])
+// CHECK-NEXT: store <4 x i32> [[REG168]], <4 x i32>* [[REG161:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG169:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG166]], align 16
+// CHECK-NEXT: [[REG170:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>, <4 x i32> [[REG169]])
+// CHECK-NEXT: store <4 x i32> [[REG170]], <4 x i32>* [[REG171:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG172:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG161]], align 16
+// CHECK-NEXT: [[REG173:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG171]], align 16
+// CHECK-NEXT: [[REG174:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_and(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG172]], <4 x i32> [[REG173]])
+// CHECK-NEXT: store <4 x i32> [[REG174]], <4 x i32>* [[REG161]], align 16
+// CHECK: [[REG175:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG161]], align 16
+// CHECK-NEXT: [[REG176:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG175]] to <4 x float>
+// CHECK-NEXT: [[REG177:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> [[REG176]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG177]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpunord_ps
+// CHECK: [[REG178:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG179:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> [[REG178]])
+// CHECK-NEXT: [[REG180:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG179]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG180]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG181:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG182:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> [[REG181]])
+// CHECK-NEXT: [[REG183:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG182]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG183]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG184:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG185:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG184]], <4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>)
+// CHECK-NEXT: store <4 x i32> [[REG185]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG186:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG187:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG186]], <4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>)
+// CHECK-NEXT: store <4 x i32> [[REG187]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG188:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG189:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG190:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_or(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG188]], <4 x i32> [[REG189]])
+// CHECK-NEXT: [[REG191:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG190]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG191]]
+
+// CHECK: define available_externally <4 x float> @_mm_cmpunord_ss
+// CHECK: [[REG192:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG193:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> [[REG192]])
+// CHECK-NEXT: [[REG194:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG193]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG194]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG195:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG196:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_abs(float vector[4])(<4 x float> [[REG195]])
+// CHECK-NEXT: [[REG197:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG196]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG197]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG198:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG199:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG198]], <4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>)
+// CHECK-NEXT: store <4 x i32> [[REG199]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG200:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG201:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG200]], <4 x i32> <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>)
+// CHECK-NEXT: store <4 x i32> [[REG201]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG202:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG203:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG204:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_or(unsigned int vector[4], unsigned int vector[4])(<4 x i32> [[REG202]], <4 x i32> [[REG203]])
+// CHECK-NEXT: store <4 x i32> [[REG204]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG205:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG206:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG207:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG206]] to <4 x float>
+// CHECK-NEXT: [[REG208:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG205]], <4 x float> [[REG207]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG208]]
+
+void __attribute__((noinline))
+test_comi() {
+  i = _mm_comieq_ss(m1, m2);
+  i = _mm_comige_ss(m1, m2);
+  i = _mm_comigt_ss(m1, m2);
+  i = _mm_comile_ss(m1, m2);
+  i = _mm_comilt_ss(m1, m2);
+  i = _mm_comineq_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_comi
+
+// CHECK: define available_externally signext i32 @_mm_comieq_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG209:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG210:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG209]], i32 0
+// CHECK-NEXT: [[REG211:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG212:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG211]], i32 0
+// CHECK-NEXT: [[REG213:[0-9a-zA-Z_%.]+]] = fcmp oeq float [[REG210]], [[REG212]]
+// CHECK-NEXT: [[REG214:[0-9a-zA-Z_%.]+]] = zext i1 [[REG213]] to i32
+// CHECK-NEXT: ret i32 [[REG214]]
+
+// CHECK: define available_externally signext i32 @_mm_comige_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG215:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG216:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG215]], i32 0
+// CHECK-NEXT: [[REG217:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG218:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG217]], i32 0
+// CHECK-NEXT: [[REG219:[0-9a-zA-Z_%.]+]] = fcmp oge float [[REG216]], [[REG218]]
+// CHECK-NEXT: [[REG220:[0-9a-zA-Z_%.]+]] = zext i1 [[REG219]] to i32
+// CHECK-NEXT: ret i32 [[REG220]]
+
+// CHECK: define available_externally signext i32 @_mm_comigt_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG221:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG222:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG221]], i32 0
+// CHECK-NEXT: [[REG223:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG224:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG223]], i32 0
+// CHECK-NEXT: [[REG225:[0-9a-zA-Z_%.]+]] = fcmp ogt float [[REG222]], [[REG224]]
+// CHECK-NEXT: [[REG226:[0-9a-zA-Z_%.]+]] = zext i1 [[REG225]] to i32
+// CHECK-NEXT: ret i32 [[REG226]]
+
+// CHECK: define available_externally signext i32 @_mm_comile_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG227:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG228:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG227]], i32 0
+// CHECK-NEXT: [[REG229:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG230:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG229]], i32 0
+// CHECK-NEXT: [[REG231:[0-9a-zA-Z_%.]+]] = fcmp ole float [[REG228]], [[REG230]]
+// CHECK-NEXT: [[REG232:[0-9a-zA-Z_%.]+]] = zext i1 [[REG231]] to i32
+// CHECK-NEXT: ret i32 [[REG232]]
+
+// CHECK: define available_externally signext i32 @_mm_comilt_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG233:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG234:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG233]], i32 0
+// CHECK-NEXT: [[REG235:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG236:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG235]], i32 0
+// CHECK-NEXT: [[REG237:[0-9a-zA-Z_%.]+]] = fcmp olt float [[REG234]], [[REG236]]
+// CHECK-NEXT: [[REG238:[0-9a-zA-Z_%.]+]] = zext i1 [[REG237]] to i32
+// CHECK-NEXT: ret i32 [[REG238]]
+
+// CHECK: define available_externally signext i32 @_mm_comineq_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG239:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG240:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG239]], i32 0
+// CHECK-NEXT: [[REG241:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG242:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG241]], i32 0
+// CHECK-NEXT: [[REG243:[0-9a-zA-Z_%.]+]] = fcmp une float [[REG240]], [[REG242]]
+// CHECK-NEXT: [[REG244:[0-9a-zA-Z_%.]+]] = zext i1 [[REG243]] to i32
+// CHECK-NEXT: ret i32 [[REG244]]
+
+void __attribute__((noinline))
+test_convert() {
+  res = _mm_cvt_pi2ps(m1, ms[1]);
+  res64 = _mm_cvt_ps2pi(m1);
+  res = _mm_cvt_si2ss(m1, i);
+  i = _mm_cvt_ss2si(m1);
+  res = _mm_cvtpi16_ps(ms[0]);
+  res = _mm_cvtpi32_ps(m1, ms[1]);
+  res = _mm_cvtpi32x2_ps(ms[0], ms[1]);
+  res = _mm_cvtpi8_ps(ms[0]);
+  res64 = _mm_cvtps_pi16(m1);
+  res64 = _mm_cvtps_pi32(m1);
+  res64 = _mm_cvtps_pi8(m1);
+  res = _mm_cvtpu16_ps(ms[0]);
+  res = _mm_cvtpu8_ps(ms[0]);
+  res = _mm_cvtsi32_ss(m1, i);
+  res = _mm_cvtsi64_ss(m1, i64);
+  fs[0] = _mm_cvtss_f32(m1);
+  i = _mm_cvtss_si32(m1);
+  i64 = _mm_cvtss_si64(m1);
+  res64 = _mm_cvtt_ps2pi(m1);
+  i = _mm_cvtt_ss2si(m1);
+  res64 = _mm_cvttps_pi32(m1);
+  i = _mm_cvttss_si32(m1);
+  i64 = _mm_cvttss_si64(m1);
+}
+
+// CHECK-LABEL: @test_convert
+
+// CHECK: define available_externally <4 x float> @_mm_cvt_pi2ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG245:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG246:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG247:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_cvtpi32_ps(<4 x float> [[REG245]], i64 [[REG246]])
+// CHECK-NEXT: ret <4 x float> [[REG247]]
+
+// CHECK: define available_externally i64 @_mm_cvt_ps2pi
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG248:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG249:[0-9a-zA-Z_%.]+]] = call i64 @_mm_cvtps_pi32(<4 x float> [[REG248]])
+// CHECK-NEXT: ret i64 [[REG249]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvt_si2ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i32 {{[0-9a-zA-Z_%.]+}}, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG250:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG251:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG252:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_cvtsi32_ss(<4 x float> [[REG250]], i32 signext [[REG251]])
+// CHECK-NEXT: ret <4 x float> [[REG252]]
+
+// CHECK: define available_externally signext i32 @_mm_cvt_ss2si
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG253:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG254:[0-9a-zA-Z_%.]+]] = call signext i32 @_mm_cvtss_si32(<4 x float> [[REG253]])
+// CHECK-NEXT: ret i32 [[REG254]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpi16_ps
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG255:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG256:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG255]], i32 0
+// CHECK-NEXT: [[REG257:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG258:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG256]], i64 [[REG257]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG258]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG259:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG260:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG259]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG260]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG261:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG262:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_vupklsh(short vector[8])(<8 x i16> [[REG261]])
+// CHECK-NEXT: store <4 x i32> [[REG262]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG263:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG264:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfsx(<4 x i32> [[REG263]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG264]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG265:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG265]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpi32_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG266:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG267:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG266]], i32 0
+// CHECK-NEXT: [[REG268:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG269:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG267]], i64 [[REG268]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG269]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG270:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG271:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG270]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG271]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG272:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG273:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfsx(<4 x i32> [[REG272]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG273]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG274:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG275:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG274]] to <2 x i64>
+// CHECK-NEXT: [[REG276:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG275]], i32 0
+// CHECK-NEXT: [[REG277:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG276]], i32 0
+// CHECK-NEXT: [[REG278:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG279:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG278]] to <2 x i64>
+// CHECK-NEXT: [[REG280:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG279]], i32 1
+// CHECK-NEXT: [[REG281:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG277]], i64 [[REG280]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG281]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG282:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG283:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG282]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG283]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpi32x2_ps
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG284:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG285:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG284]], i32 0
+// CHECK-NEXT: [[REG286:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG287:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG285]], i64 [[REG286]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG287]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG288:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG289:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG288]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG289]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG290:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG291:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfsx(<4 x i32> [[REG290]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG291]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG292:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG292]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpi8_ps
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG293:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG294:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG293]], i32 0
+// CHECK-NEXT: [[REG295:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG296:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG294]], i64 [[REG295]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG296]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG297:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG298:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG297]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG298]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG299:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG300:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_vupkhsb(signed char vector[16])(<16 x i8> [[REG299]])
+// CHECK-NEXT: store <8 x i16> [[REG300]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG301:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG302:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_vupkhsh(short vector[8])(<8 x i16> [[REG301]])
+// CHECK-NEXT: store <4 x i32> [[REG302]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG303:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG304:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfsx(<4 x i32> [[REG303]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG304]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG305:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG305]]
+
+// CHECK: define available_externally i64 @_mm_cvtps_pi16
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG306:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG307:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_rint(float vector[4])(<4 x float> [[REG306]])
+// CHECK-NEXT: store <4 x float> [[REG307]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG308:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG309:[0-9a-zA-Z_%.]+]] = call <4 x i32> @llvm.ppc.altivec.vctsxs(<4 x float> [[REG308]], i32 0)
+// CHECK-NEXT: store <4 x i32> [[REG309]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG310:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG311:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG312:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG310]], <4 x i32> [[REG311]])
+// CHECK-NEXT: [[REG313:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG312]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[REG313]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG314:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG315:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG314]], i32 0
+// CHECK-NEXT: ret i64 [[REG315]]
+
+// CHECK: define available_externally i64 @_mm_cvtps_pi32
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG316:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG317:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG316]] to <2 x i64>
+// CHECK-NEXT: [[REG318:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splat(long long vector[2], unsigned int)(<2 x i64> [[REG317]], i32 zeroext 0)
+// CHECK-NEXT: [[REG319:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG318]] to <4 x float>
+// CHECK-NEXT: store <4 x float> [[REG319]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG320:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG321:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_rint(float vector[4])(<4 x float> [[REG320]])
+// CHECK-NEXT: store <4 x float> [[REG321]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG322:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG323:[0-9a-zA-Z_%.]+]] = call <4 x i32> @llvm.ppc.altivec.vctsxs(<4 x float> [[REG322]], i32 0)
+// CHECK-NEXT: [[REG324:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG323]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[REG324]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG325:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG326:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG325]], i32 0
+// CHECK-NEXT: ret i64 [[REG326]]
+
+// CHECK: define available_externally i64 @_mm_cvtps_pi8
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG327:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG328:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_rint(float vector[4])(<4 x float> [[REG327]])
+// CHECK-NEXT: store <4 x float> [[REG328]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG329:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG330:[0-9a-zA-Z_%.]+]] = call <4 x i32> @llvm.ppc.altivec.vctsxs(<4 x float> [[REG329]], i32 0)
+// CHECK-NEXT: store <4 x i32> [[REG330]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG331:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG332:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_pack(int vector[4], int vector[4])(<4 x i32> [[REG331]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: store <8 x i16> [[REG332]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG333:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG334:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG335:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_pack(short vector[8], short vector[8])(<8 x i16> [[REG333]], <8 x i16> [[REG334]])
+// CHECK-NEXT: store <16 x i8> [[REG335]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG336:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG337:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG336]] to <2 x i64>
+// CHECK-NEXT: [[REG338:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG337]], i32 0
+// CHECK-NEXT: ret i64 [[REG338]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpu16_ps
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <8 x i16> zeroinitializer, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG339:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG340:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG339]], i32 0
+// CHECK-NEXT: [[REG341:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG342:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG340]], i64 [[REG341]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG342]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG343:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG344:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG343]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG344]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG345:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG346:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mergel(unsigned short vector[8], unsigned short vector[8])(<8 x i16> [[REG345]], <8 x i16> zeroinitializer)
+// CHECK-BE-NEXT: [[REG346:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mergel(unsigned short vector[8], unsigned short vector[8])(<8 x i16> zeroinitializer, <8 x i16> [[REG345]])
+// CHECK-NEXT: [[REG347:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG346]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG347]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG348:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG349:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfux(<4 x i32> [[REG348]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG349]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG350:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG350]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtpu8_ps
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <16 x i8> zeroinitializer, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG351:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG352:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> undef, i64 [[REG351]], i32 0
+// CHECK-NEXT: [[REG353:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG354:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG352]], i64 [[REG353]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG354]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG355:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG356:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG355]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG356]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG357:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG358:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_mergel(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG357]], <16 x i8> zeroinitializer)
+// CHECK-BE-NEXT: [[REG358:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_mergel(unsigned char vector[16], unsigned char vector[16])(<16 x i8> zeroinitializer, <16 x i8> [[REG357]])
+// CHECK-NEXT: [[REG359:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG358]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG359]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG360:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG361:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> [[REG360]], <8 x i16> zeroinitializer)
+// CHECK-BE-NEXT: [[REG361:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> zeroinitializer, <8 x i16> [[REG360]])
+// CHECK-NEXT: [[REG362:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG361]] to <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[REG362]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG363:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG364:[0-9a-zA-Z_%.]+]] = call <4 x float> @llvm.ppc.altivec.vcfux(<4 x i32> [[REG363]], i32 0)
+// CHECK-NEXT: store <4 x float> [[REG364]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG365:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG365]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtsi32_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i32 {{[0-9a-zA-Z_%.]+}}, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG366:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG367:[0-9a-zA-Z_%.]+]] = sitofp i32 [[REG366]] to float
+// CHECK-NEXT: store float [[REG367]], float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG368:[0-9a-zA-Z_%.]+]] = load float, float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG369:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG370:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG369]], float [[REG368]], i32 0
+// CHECK-NEXT: store <4 x float> [[REG370]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG371:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG371]]
+
+// CHECK: define available_externally <4 x float> @_mm_cvtsi64_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG372:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG373:[0-9a-zA-Z_%.]+]] = sitofp i64 [[REG372]] to float
+// CHECK-NEXT: store float [[REG373]], float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG374:[0-9a-zA-Z_%.]+]] = load float, float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG375:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG376:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG375]], float [[REG374]], i32 0
+// CHECK-NEXT: store <4 x float> [[REG376]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG377:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG377]]
+
+// CHECK: define available_externally float @_mm_cvtss_f32
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG378:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG379:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG378]], i32 0
+// CHECK-NEXT: ret float [[REG379]]
+
+// CHECK: define available_externally signext i32 @_mm_cvtss_si32
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 0, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG380:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG381:[0-9a-zA-Z_%.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw  $2,$2;\0Amfvsrd  $1,${2:x};\0A", "=^wa,=r,=f,0"(<4 x float> [[REG380]])
+// CHECK-BE-NEXT: [[REG381:[0-9a-zA-Z_%.]+]] = call { <4 x float>, i64, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw  $2,$2;\0Amfvsrd  $1,${2:x};\0A", "=^wa,=r,=f,0"(<4 x float> [[REG380]])
+// CHECK-NEXT: [[REG382:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG381]], 0
+// CHECK-NEXT: [[REG383:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG381]], 1
+// CHECK-NEXT: [[REG384:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG381]], 2
+// CHECK-NEXT: store <4 x float> [[REG382]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 [[REG383]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store double [[REG384]], double* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG385:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG386:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG385]] to i32
+// CHECK-NEXT: ret i32 [[REG386]]
+
+// CHECK: define available_externally i64 @_mm_cvtss_si64
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 0, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG387:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG388:[0-9a-zA-Z_%.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid  $2,$2;\0Amfvsrd  $1,${2:x};\0A", "=^wa,=r,=f,0"(<4 x float> [[REG387]])
+// CHECK-BE-NEXT: [[REG388:[0-9a-zA-Z_%.]+]] = call { <4 x float>, i64, double } asm "xscvspdp ${2:x},${0:x};\0Afctid  $2,$2;\0Amfvsrd  $1,${2:x};\0A", "=^wa,=r,=f,0"(<4 x float> [[REG387]])
+// CHECK-NEXT: [[REG389:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG388]], 0
+// CHECK-NEXT: [[REG390:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG388]], 1
+// CHECK-NEXT: [[REG391:[0-9a-zA-Z_%.]+]] = extractvalue { <4 x float>, i64, double } [[REG388]], 2
+// CHECK-NEXT: store <4 x float> [[REG389]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store i64 [[REG390]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store double [[REG391]], double* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG392:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: ret i64 [[REG392]]
+
+// CHECK: define available_externally i64 @_mm_cvtt_ps2pi
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK: [[REG393:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG394:[0-9a-zA-Z_%.]+]] = call i64 @_mm_cvttps_pi32(<4 x float> [[REG393]])
+// CHECK-NEXT: ret i64 [[REG394]]
+
+// CHECK: define available_externally signext i32 @_mm_cvtt_ss2si
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG395:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG396:[0-9a-zA-Z_%.]+]] = call signext i32 @_mm_cvttss_si32(<4 x float> [[REG395]])
+// CHECK-NEXT: ret i32 [[REG396]]
+
+// CHECK: define available_externally i64 @_mm_cvttps_pi32
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG397:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG398:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG397]] to <2 x i64>
+// CHECK-NEXT: [[REG399:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splat(long long vector[2], unsigned int)(<2 x i64> [[REG398]], i32 zeroext 0)
+// CHECK-NEXT: [[REG400:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG399]] to <4 x float>
+// CHECK-NEXT: store <4 x float> [[REG400]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG401:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG402:[0-9a-zA-Z_%.]+]] = call <4 x i32> @llvm.ppc.altivec.vctsxs(<4 x float> [[REG401]], i32 0)
+// CHECK-NEXT: [[REG403:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG402]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[REG403]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG404:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG405:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG404]], i32 0
+// CHECK-NEXT: ret i64 [[REG405]]
+
+// CHECK: define available_externally signext i32 @_mm_cvttss_si32
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG406:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG407:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG406]], i32 0
+// CHECK-NEXT: store float [[REG407]], float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG408:[0-9a-zA-Z_%.]+]] = load float, float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG409:[0-9a-zA-Z_%.]+]] = fptosi float [[REG408]] to i32
+// CHECK-NEXT: ret i32 [[REG409]]
+
+// CHECK: define available_externally i64 @_mm_cvttss_si64
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG410:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG411:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG410]], i32 0
+// CHECK-NEXT: store float [[REG411]], float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG412:[0-9a-zA-Z_%.]+]] = load float, float* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG413:[0-9a-zA-Z_%.]+]] = fptosi float [[REG412]] to i64
+// CHECK-NEXT: ret i64 [[REG413]]
+
+void __attribute__((noinline))
+test_div() {
+  res = _mm_div_ps(m1, m2);
+  res = _mm_div_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_div
+
+// CHECK: define available_externally <4 x float> @_mm_div_ps(<4 x float> [[REG414:[0-9a-zA-Z_%.]+]], <4 x float> [[REG415:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG414]], <4 x float>* [[REG416:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG415]], <4 x float>* [[REG417:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG418:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG416]], align 16
+// CHECK-NEXT: [[REG419:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG417]], align 16
+// CHECK-NEXT: [[REG420:[0-9a-zA-Z_%.]+]] = fdiv <4 x float> [[REG418]], [[REG419]]
+// CHECK-NEXT: ret <4 x float> [[REG420]]
+
+// CHECK: define available_externally <4 x float> @_mm_div_ss(<4 x float> [[REG421:[0-9a-zA-Z_%.]+]], <4 x float> [[REG422:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG421]], <4 x float>* [[REG423:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG422]], <4 x float>* [[REG424:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG425:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG423]], align 16
+// CHECK-NEXT: [[REG426:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG425]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG426]], <4 x float>* [[REG427:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG428:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG424]], align 16
+// CHECK-NEXT: [[REG429:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG428]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG429]], <4 x float>* [[REG430:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG431:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG427]], align 16
+// CHECK-NEXT: [[REG432:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG430]], align 16
+// CHECK-NEXT: [[REG433:[0-9a-zA-Z_%.]+]] = fdiv <4 x float> [[REG431]], [[REG432]]
+// CHECK-NEXT: store <4 x float> [[REG433]], <4 x float>* [[REG434:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG435:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG423]], align 16
+// CHECK-NEXT: [[REG436:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG434]], align 16
+// CHECK-NEXT: [[REG437:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG435]], <4 x float> [[REG436]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG437]]
+
+void __attribute__((noinline))
+test_extract() {
+  i = _mm_extract_pi16(ms[0], i2);
+  i = _m_pextrw(ms[0], i2);
+}
+
+// CHECK-LABEL: @test_extract
+
+// CHECK: define available_externally signext i32 @_mm_extract_pi16
+// CHECK: [[REG438:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG439:[0-9a-zA-Z_%.]+]] = and i32 [[REG438]], 3
+// CHECK-NEXT: store i32 [[REG439]], i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-BE: sub i32 3, {{[0-9a-zA-Z_%.]+}}
+// CHECK: [[REG440:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK: [[REG441:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK: [[REG442:[0-9a-zA-Z_%.]+]] = mul i32 [[REG441]], 16
+// CHECK: [[REG443:[0-9a-zA-Z_%.]+]] = zext i32 [[REG442]] to i64
+// CHECK-NEXT: [[REG444:[0-9a-zA-Z_%.]+]] = lshr i64 [[REG440]], [[REG443]]
+// CHECK-NEXT: [[REG445:[0-9a-zA-Z_%.]+]] = and i64 [[REG444]], 65535
+// CHECK-NEXT: [[REG446:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG445]] to i32
+// CHECK-NEXT: ret i32 [[REG446]]
+
+// CHECK: define available_externally signext i32 @_m_pextrw
+// CHECK: [[REG447:[0-9a-zA-Z_%.]+]] = call signext i32 @_mm_extract_pi16
+// CHECK-NEXT: ret i32 [[REG447]]
+
+void __attribute__((noinline))
+test_insert() {
+  res64 = _mm_insert_pi16(ms[0], i, i2);
+  res64 = _m_pinsrw(ms[0], i, i2);
+}
+
+// CHECK-LABEL: @test_insert
+
+// CHECK: define available_externally i64 @_mm_insert_pi16
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i32 {{[0-9a-zA-Z_%.]+}}, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: store i32 {{[0-9a-zA-Z_%.]+}}, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG448:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG449:[0-9a-zA-Z_%.]+]] = and i32 [[REG448]], 3
+// CHECK-NEXT: [[REG450:[0-9a-zA-Z_%.]+]] = mul nsw i32 [[REG449]], 16
+// CHECK-NEXT: store i32 [[REG450]], i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG451:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG452:[0-9a-zA-Z_%.]+]] = sext i32 [[REG451]] to i64
+// CHECK-NEXT: [[REG453:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG454:[0-9a-zA-Z_%.]+]] = zext i32 [[REG453]] to i64
+// CHECK-NEXT: [[REG455:[0-9a-zA-Z_%.]+]] = shl i64 [[REG452]], [[REG454]]
+// CHECK-NEXT: store i64 [[REG455]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG456:[0-9a-zA-Z_%.]+]] = load i32, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG457:[0-9a-zA-Z_%.]+]] = zext i32 [[REG456]] to i64
+// CHECK-NEXT: [[REG458:[0-9a-zA-Z_%.]+]] = shl i64 65535, [[REG457]]
+// CHECK-NEXT: store i64 [[REG458]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG459:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG460:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG461:[0-9a-zA-Z_%.]+]] = xor i64 [[REG460]], -1
+// CHECK-NEXT: [[REG462:[0-9a-zA-Z_%.]+]] = and i64 [[REG459]], [[REG461]]
+// CHECK-NEXT: [[REG463:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG464:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG465:[0-9a-zA-Z_%.]+]] = and i64 [[REG463]], [[REG464]]
+// CHECK-NEXT: [[REG466:[0-9a-zA-Z_%.]+]] = or i64 [[REG462]], [[REG465]]
+// CHECK-NEXT: store i64 [[REG466]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG467:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: ret i64 [[REG467]]
+
+// CHECK: define available_externally i64 @_m_pinsrw
+// CHECK: [[REG468:[0-9a-zA-Z_%.]+]] = call i64 @_mm_insert_pi16
+// CHECK-NEXT: ret i64 [[REG468]]
+
+void __attribute__((noinline))
+test_load() {
+  res = _mm_load_ps(fs);
+  res = _mm_load_ps1(fs);
+  res = _mm_load_ss(fs);
+  res = _mm_load1_ps(fs);
+  res = _mm_loadh_pi(m1, &ms[0]);
+  res = _mm_loadl_pi(m1, &ms[0]);
+  res = _mm_loadr_ps(fs);
+  res = _mm_loadu_ps(fs);
+}
+
+// CHECK-LABEL: @test_load
+
+// CHECK: define available_externally <4 x float> @_mm_load_ps
+// CHECK: [[REG469:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_ld(int, float vector[4] const*)
+// CHECK-NEXT: ret <4 x float> [[REG469]]
+
+// CHECK: define available_externally <4 x float> @_mm_load_ps1
+// CHECK: [[REG470:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_load1_ps
+// CHECK-NEXT: ret <4 x float> [[REG470]]
+
+// CHECK: define available_externally <4 x float> @_mm_load_ss
+// CHECK: [[REG471:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_set_ss
+// CHECK-NEXT: ret <4 x float> [[REG471]]
+
+// CHECK: define available_externally <4 x float> @_mm_load1_ps
+// CHECK: [[REG472:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_set1_ps
+// CHECK-NEXT: ret <4 x float> [[REG472]]
+
+// CHECK: define available_externally <4 x float> @_mm_loadh_pi
+// CHECK: [[REG473:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)
+// CHECK-NEXT: store <2 x i64> [[REG473]], <2 x i64>* [[REG474:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG475:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG474]], align 16
+// CHECK-NEXT: [[REG476:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG475]], i32 1
+// CHECK-NEXT: [[REG477:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG478:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG479:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG477]], i64 [[REG476]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG479]], <2 x i64>* [[REG478]], align 16
+// CHECK-NEXT: [[REG480:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG478]], align 16
+// CHECK-NEXT: [[REG481:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG480]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG481]]
+
+// CHECK: define available_externally <4 x float> @_mm_loadl_pi
+// CHECK: [[REG482:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)
+// CHECK-NEXT: store <2 x i64> [[REG482]], <2 x i64>* [[REG483:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG484:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG483]], align 16
+// CHECK-NEXT: [[REG485:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG484]], i32 0
+// CHECK-NEXT: [[REG486:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG487:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG488:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> [[REG486]], i64 [[REG485]], i32 0
+// CHECK-NEXT: store <2 x i64> [[REG488]], <2 x i64>* [[REG487]], align 16
+// CHECK-NEXT: [[REG489:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* [[REG487]], align 16
+// CHECK-NEXT: [[REG490:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG489]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG490]]
+
+// CHECK: define available_externally <4 x float> @_mm_loadr_ps
+// CHECK: [[REG491:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_ld(int, float vector[4] const*)
+// CHECK-NEXT: store <4 x float> [[REG491]], <4 x float>* [[REG492:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG493:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG492]], align 16
+// CHECK-NEXT: [[REG494:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG492]], align 16
+// CHECK-NEXT: [[REG495:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG493]], <4 x float> [[REG494]], <16 x i8> <i8 28, i8 29, i8 30, i8 31, i8 24, i8 25, i8 26, i8 27, i8 20, i8 21, i8 22, i8 23, i8 16, i8 17, i8 18, i8 19>)
+// CHECK-NEXT: store <4 x float> [[REG495]], <4 x float>* [[REG496:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG497:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG496]], align 16
+// CHECK-NEXT: ret <4 x float> [[REG497]]
+
+// CHECK: define available_externally <4 x float> @_mm_loadu_ps
+// CHECK: [[REG498:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vsx_ld(int, float const*)
+// CHECK-NEXT: ret <4 x float> [[REG498]]
+
+void __attribute__((noinline))
+test_logic() {
+  res = _mm_or_ps(m1, m2);
+  res = _mm_and_ps(m1, m2);
+  res = _mm_andnot_ps(m1, m2);
+  res = _mm_xor_ps(m1, m2);
+}
+
+// CHECK-LABEL: @test_logic
+
+// CHECK: define available_externally <4 x float> @_mm_or_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG499:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG500:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG501:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_or(float vector[4], float vector[4])(<4 x float> [[REG499]], <4 x float> [[REG500]])
+// CHECK-NEXT: ret <4 x float> [[REG501]]
+
+// CHECK: define available_externally <4 x float> @_mm_and_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG502:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG503:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG504:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_and(float vector[4], float vector[4])(<4 x float> [[REG502]], <4 x float> [[REG503]])
+// CHECK-NEXT: ret <4 x float> [[REG504]]
+
+// CHECK: define available_externally <4 x float> @_mm_andnot_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG505:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG506:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG507:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_andc(float vector[4], float vector[4])(<4 x float> [[REG505]], <4 x float> [[REG506]])
+// CHECK-NEXT: ret <4 x float> [[REG507]]
+
+// CHECK: define available_externally <4 x float> @_mm_xor_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG508:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG509:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG510:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_xor(float vector[4], float vector[4])(<4 x float> [[REG508]], <4 x float> [[REG509]])
+// CHECK-NEXT: ret <4 x float> [[REG510]]
+
+void __attribute__((noinline))
+test_max() {
+  res = _mm_max_ps(m1, m2);
+  res = _mm_max_ss(m1, m2);
+  res64 = _mm_max_pi16(ms[0], ms[1]);
+  res64 = _mm_max_pu8(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_max
+
+// CHECK: define available_externally <4 x float> @_mm_max_ps(<4 x float> [[REG511:[0-9a-zA-Z_%.]+]], <4 x float> [[REG512:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG511]], <4 x float>* [[REG513:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG512]], <4 x float>* [[REG514:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG515:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG513]], align 16
+// CHECK-NEXT: [[REG516:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG514]], align 16
+// CHECK-NEXT: [[REG517:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])(<4 x float> [[REG515]], <4 x float> [[REG516]])
+// CHECK-NEXT: store <4 x i32> [[REG517]], <4 x i32>* [[REG518:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG519:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG514]], align 16
+// CHECK-NEXT: [[REG520:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG513]], align 16
+// CHECK-NEXT: [[REG521:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG518]], align 16
+// CHECK-NEXT: [[REG522:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], bool vector[4])(<4 x float> [[REG519]], <4 x float> [[REG520]], <4 x i32> [[REG521]])
+// CHECK-NEXT: ret <4 x float> [[REG522]]
+
+// CHECK: define available_externally <4 x float> @_mm_max_ss
+// CHECK: [[REG523:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG523]], <4 x float>* [[REG524:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG525:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG525]], <4 x float>* [[REG526:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG527:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG524]], align 16
+// CHECK-NEXT: [[REG528:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG526]], align 16
+// CHECK-NEXT: call <4 x float> @vec_max(float vector[4], float vector[4])(<4 x float> [[REG527]], <4 x float> [[REG528]])
+// CHECK: [[REG529:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG529]]
+
+// CHECK: define available_externally i64 @_mm_max_pi16
+// CHECK: [[REG530:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG531:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG530]])
+// CHECK-NEXT: [[REG532:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG531]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG532]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG533:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG534:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG533]])
+// CHECK-NEXT: [[REG535:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG534]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG535]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG536:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG537:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG538:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmpgt(short vector[8], short vector[8])(<8 x i16> [[REG536]], <8 x i16> [[REG537]])
+// CHECK-NEXT: store <8 x i16> [[REG538]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG539:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG540:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG541:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG542:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sel(short vector[8], short vector[8], bool vector[8])(<8 x i16> [[REG539]], <8 x i16> [[REG540]], <8 x i16> [[REG541]])
+// CHECK-NEXT: store <8 x i16> [[REG542]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG543:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG544:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG543]] to <2 x i64>
+// CHECK-NEXT: [[REG545:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG544]], i32 0
+// CHECK-NEXT: ret i64 [[REG545]]
+
+// CHECK: define available_externally i64 @_mm_max_pu8
+// CHECK: [[REG546:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG547:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG546]])
+// CHECK-NEXT: [[REG548:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG547]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG548]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG549:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG550:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG549]])
+// CHECK-NEXT: [[REG551:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG550]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG551]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG552:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG553:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG554:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmpgt(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG552]], <16 x i8> [[REG553]])
+// CHECK-NEXT: store <16 x i8> [[REG554]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG555:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG556:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG557:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG558:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], bool vector[16])(<16 x i8> [[REG555]], <16 x i8> [[REG556]], <16 x i8> [[REG557]])
+// CHECK-NEXT: store <16 x i8> [[REG558]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG559:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG560:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG559]] to <2 x i64>
+// CHECK-NEXT: [[REG561:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG560]], i32 0
+// CHECK-NEXT: ret i64 [[REG561]]
+
+void __attribute__((noinline))
+test_alt_name_max() {
+  res64 = _m_pmaxsw(ms[0], ms[1]);
+  res64 = _m_pmaxub(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_alt_name_max
+
+// CHECK: define available_externally i64 @_m_pmaxsw
+// CHECK: [[REG562:[0-9a-zA-Z_%.]+]] = call i64 @_mm_max_pi16
+// CHECK-NEXT: ret i64 [[REG562]]
+
+// CHECK: define available_externally i64 @_m_pmaxub
+// CHECK: [[REG563:[0-9a-zA-Z_%.]+]] = call i64 @_mm_max_pu8
+// CHECK-NEXT: ret i64 [[REG563]]
+
+void __attribute__((noinline))
+test_min() {
+  res = _mm_min_ps(m1, m2);
+  res = _mm_min_ss(m1, m2);
+  res64 = _mm_min_pi16(ms[0], ms[1]);
+  res64 = _mm_min_pu8(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_min
+
+// CHECK: define available_externally <4 x float> @_mm_min_ps(<4 x float> [[REG517:[0-9a-zA-Z_%.]+]], <4 x float> [[REG518:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG517]], <4 x float>* [[REG564:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG518]], <4 x float>* [[REG565:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG566:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG565]], align 16
+// CHECK-NEXT: [[REG567:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG564]], align 16
+// CHECK-NEXT: [[REG568:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_cmpgt(float vector[4], float vector[4])(<4 x float> [[REG566]], <4 x float> [[REG567]])
+// CHECK-NEXT: store <4 x i32> [[REG568]], <4 x i32>* [[REG569:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG570:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG565]], align 16
+// CHECK-NEXT: [[REG571:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG564]], align 16
+// CHECK-NEXT: [[REG572:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* [[REG569]], align 16
+// CHECK-NEXT: [[REG573:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], bool vector[4])(<4 x float> [[REG570]], <4 x float> [[REG571]], <4 x i32> [[REG572]])
+// CHECK-NEXT: ret <4 x float> [[REG573]]
+
+// CHECK: define available_externally <4 x float> @_mm_min_ss
+// CHECK: [[REG574:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG574]], <4 x float>* [[REG575:[0-9a-zA-Z_%.]+]], align 16
+// CHECK: [[REG576:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, {{i32|i32 zeroext}} 0)
+// CHECK-NEXT: store <4 x float> [[REG576]], <4 x float>* [[REG577:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG578:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG575]], align 16
+// CHECK-NEXT: [[REG579:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG577]], align 16
+// CHECK-NEXT: call <4 x float> @vec_min(float vector[4], float vector[4])(<4 x float> [[REG578]], <4 x float> [[REG579]])
+// CHECK: [[REG580:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG580]]
+
+// CHECK: define available_externally i64 @_mm_min_pi16
+// CHECK: [[REG581:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG582:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG581]])
+// CHECK-NEXT: [[REG583:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG582]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG583]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG584:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG585:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG584]])
+// CHECK-NEXT: [[REG586:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG585]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG586]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG587:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG588:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG589:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_cmplt(short vector[8], short vector[8])(<8 x i16> [[REG587]], <8 x i16> [[REG588]])
+// CHECK-NEXT: store <8 x i16> [[REG589]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG590:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG591:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG592:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG593:[0-9a-zA-Z_%.]+]] = call <8 x i16> @vec_sel(short vector[8], short vector[8], bool vector[8])(<8 x i16> [[REG590]], <8 x i16> [[REG591]], <8 x i16> [[REG592]])
+// CHECK-NEXT: store <8 x i16> [[REG593]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG594:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG595:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG594]] to <2 x i64>
+// CHECK-NEXT: [[REG596:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG595]], i32 0
+// CHECK-NEXT: ret i64 [[REG596]]
+
+// CHECK: define available_externally i64 @_mm_min_pu8
+// CHECK: [[REG597:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG598:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG597]])
+// CHECK-NEXT: [[REG599:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG598]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG599]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG600:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG601:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG600]])
+// CHECK-NEXT: [[REG602:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG601]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG602]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG603:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG604:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG605:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_cmplt(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG603]], <16 x i8> [[REG604]])
+// CHECK-NEXT: store <16 x i8> [[REG605]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG606:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG607:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG608:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG609:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], bool vector[16])(<16 x i8> [[REG606]], <16 x i8> [[REG607]], <16 x i8> [[REG608]])
+// CHECK-NEXT: store <16 x i8> [[REG609]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG610:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG611:[0-9a-zA-Z_%.]+]] = bitcast <16 x i8> [[REG610]] to <2 x i64>
+// CHECK-NEXT: [[REG612:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG611]], i32 0
+// CHECK-NEXT: ret i64 [[REG612]]
+
+void __attribute__((noinline))
+test_alt_name_min() {
+  res64 = _m_pminsw(ms[0], ms[1]);
+  res64 = _m_pminub(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_alt_name_min
+
+// CHECK: define available_externally i64 @_m_pminsw
+// CHECK: [[REG613:[0-9a-zA-Z_%.]+]] = call i64 @_mm_min_pi16
+// CHECK-NEXT: ret i64 [[REG613]]
+
+// CHECK: define available_externally i64 @_m_pminub
+// CHECK: [[REG614:[0-9a-zA-Z_%.]+]] = call i64 @_mm_min_pu8
+// CHECK-NEXT: ret i64 [[REG614]]
+
+void __attribute__((noinline))
+test_move() {
+  _mm_maskmove_si64(ms[0], ms[1], (char *)&res64);
+  res = _mm_move_ss(m1, m2);
+  res = _mm_movehl_ps(m1, m2);
+  res = _mm_movelh_ps(m1, m2);
+  i = _mm_movemask_pi8(ms[0]);
+  i = _mm_movemask_ps(m1);
+}
+
+// CHECK-LABEL: @test_move
+
+// CHECK: define available_externally void @_mm_maskmove_si64
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i8* {{[0-9a-zA-Z_%.]+}}, i8** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 -9187201950435737472, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG615:[0-9a-zA-Z_%.]+]] = load i8*, i8** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG616:[0-9a-zA-Z_%.]+]] = bitcast i8* [[REG615]] to i64*
+// CHECK-NEXT: store i64* [[REG616]], i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG617:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG618:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG617]], align 8
+// CHECK-NEXT: store i64 [[REG618]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG619:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG620:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG621:[0-9a-zA-Z_%.]+]] = and i64 [[REG619]], [[REG620]]
+// CHECK-NEXT: [[REG622:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG623:[0-9a-zA-Z_%.]+]] = call i64 @_mm_cmpeq_pi8(i64 [[REG621]], i64 [[REG622]])
+// CHECK-NEXT: store i64 [[REG623]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG624:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG625:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG626:[0-9a-zA-Z_%.]+]] = xor i64 [[REG625]], -1
+// CHECK-NEXT: [[REG627:[0-9a-zA-Z_%.]+]] = and i64 [[REG624]], [[REG626]]
+// CHECK-NEXT: [[REG628:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG629:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG630:[0-9a-zA-Z_%.]+]] = and i64 [[REG628]], [[REG629]]
+// CHECK-NEXT: [[REG631:[0-9a-zA-Z_%.]+]] = or i64 [[REG627]], [[REG630]]
+// CHECK-NEXT: store i64 [[REG631]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG632:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG633:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 [[REG632]], i64* [[REG633]], align 8
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally <4 x float> @_mm_move_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG634:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG635:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG636:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG634]], <4 x float> [[REG635]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG636]]
+
+// CHECK: define available_externally <4 x float> @_mm_movehl_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG637:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG638:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG637]] to <2 x i64>
+// CHECK-NEXT: [[REG639:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG640:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG639]] to <2 x i64>
+// CHECK-NEXT: [[REG641:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergel(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> [[REG638]], <2 x i64> [[REG640]])
+// CHECK-NEXT: [[REG642:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG641]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG642]]
+
+// CHECK: define available_externally <4 x float> @_mm_movelh_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG643:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG644:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG643]] to <2 x i64>
+// CHECK-NEXT: [[REG645:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG646:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG645]] to <2 x i64>
+// CHECK-NEXT: [[REG647:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergeh(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> [[REG644]], <2 x i64> [[REG646]])
+// CHECK-NEXT: [[REG648:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG647]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[REG648]]
+
+// CHECK: define available_externally signext i32 @_mm_movemask_pi8
+// CHECK: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-LE-NEXT: store i64 2269495618449464, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-BE-NEXT: store i64 4048780183313844224, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG649:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG650:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG651:[0-9a-zA-Z_%.]+]] = call i64 @llvm.ppc.bpermd(i64 [[REG649]], i64 [[REG650]])
+// CHECK-NEXT: [[REG652:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG651]] to i32
+// CHECK-NEXT: ret i32 [[REG652]]
+
+// CHECK: define available_externally signext i32 @_mm_movemask_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG653:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG654:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG653]] to <16 x i8>
+// CHECK-LE-NEXT: [[REG655:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG654]], <16 x i8> bitcast (<4 x i32> <i32 2113632, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
+// CHECK-BE-NEXT: [[REG655:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG654]], <16 x i8> bitcast (<4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 2113632> to <16 x i8>))
+// CHECK-NEXT: store <2 x i64> [[REG655]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG656:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-LE-NEXT: [[REG657:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG656]], i32 1
+// CHECK-BE-NEXT: [[REG657:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG656]], i32 0
+// CHECK-NEXT: [[REG658:[0-9a-zA-Z_%.]+]] = trunc i64 [[REG657]] to i32
+// CHECK-NEXT: ret i32 [[REG658]]
+
+void __attribute__((noinline))
+test_alt_name_move() {
+  i = _m_pmovmskb(ms[0]);
+  _m_maskmovq(ms[0], ms[1], (char *)&res64);
+}
+
+// CHECK-LABEL: @test_alt_name_move
+
+// CHECK: define available_externally signext i32 @_m_pmovmskb
+// CHECK: [[REG659:[0-9a-zA-Z_%.]+]] = call signext i32 @_mm_movemask_pi8
+// CHECK-NEXT: ret i32 [[REG659]]
+
+// CHECK: define available_externally void @_m_maskmovq
+// CHECK: call void @_mm_maskmove_si64
+// CHECK-NEXT: ret void
+
+void __attribute__((noinline))
+test_mul() {
+  res = _mm_mul_ps(m1, m2);
+  res = _mm_mul_ss(m1, m2);
+  res64 = _mm_mulhi_pu16(ms[0], ms[1]);
+  res64 = _m_pmulhuw(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_mul
+
+// CHECK: define available_externally <4 x float> @_mm_mul_ps(<4 x float> [[REG660:[0-9a-zA-Z_%.]+]], <4 x float> [[REG661:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG660]], <4 x float>* [[REG662:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG661]], <4 x float>* [[REG663:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG664:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG662]], align 16
+// CHECK-NEXT: [[REG665:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG663]], align 16
+// CHECK-NEXT: [[REG666:[0-9a-zA-Z_%.]+]] = fmul <4 x float> [[REG664]], [[REG665]]
+// CHECK-NEXT: ret <4 x float> [[REG666]]
+
+// CHECK: define available_externally <4 x float> @_mm_mul_ss(<4 x float> [[REG667:[0-9a-zA-Z_%.]+]], <4 x float> [[REG668:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG667]], <4 x float>* [[REG669:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG668]], <4 x float>* [[REG670:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG671:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG669]], align 16
+// CHECK-NEXT: [[REG672:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG671]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG672]], <4 x float>* [[REG673:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG674:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG670]], align 16
+// CHECK-NEXT: [[REG675:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG674]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG675]], <4 x float>* [[REG676:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG677:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG673]], align 16
+// CHECK-NEXT: [[REG678:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG676]], align 16
+// CHECK-NEXT: [[REG679:[0-9a-zA-Z_%.]+]] = fmul <4 x float> [[REG677]], [[REG678]]
+// CHECK-NEXT: store <4 x float> [[REG679]], <4 x float>* [[REG680:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG681:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG669]], align 16
+// CHECK-NEXT: [[REG682:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG680]], align 16
+// CHECK-NEXT: [[REG683:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG681]], <4 x float> [[REG682]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG683]]
+
+// CHECK: define available_externally i64 @_mm_mulhi_pu16(i64 [[REG684:[0-9a-zA-Z_%.]+]], i64 [[REG685:[0-9a-zA-Z_%.]+]])
+// CHECK: store i64 [[REG684]], i64* [[REG686:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 [[REG685]], i64* [[REG687:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-LE-NEXT: store <16 x i8> <i8 2, i8 3, i8 18, i8 19, i8 6, i8 7, i8 22, i8 23, i8 10, i8 11, i8 26, i8 27, i8 14, i8 15, i8 30, i8 31>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-BE-NEXT: store <16 x i8> <i8 0, i8 1, i8 16, i8 17, i8 4, i8 5, i8 20, i8 21, i8 0, i8 1, i8 16, i8 17, i8 4, i8 5, i8 20, i8 21>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG688:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG686]], align 8
+// CHECK-NEXT: [[REG689:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG688]])
+// CHECK-NEXT: [[REG690:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG689]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG690]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG691:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG687]], align 8
+// CHECK-NEXT: [[REG692:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG691]])
+// CHECK-NEXT: [[REG693:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG692]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG693]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG694:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG695:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG696:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_vmuleuh(<8 x i16> [[REG694]], <8 x i16> [[REG695]])
+// CHECK-NEXT: store <4 x i32> [[REG696]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG697:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG698:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG699:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_vmulouh(<8 x i16> [[REG697]], <8 x i16> [[REG698]])
+// CHECK-NEXT: store <4 x i32> [[REG699]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG700:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG701:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG702:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG703:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_perm(unsigned int vector[4], unsigned int vector[4], unsigned char vector[16])(<4 x i32> [[REG700]], <4 x i32> [[REG701]], <16 x i8> [[REG702]])
+// CHECK-NEXT: [[REG704:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG703]] to <8 x i16>
+// CHECK-NEXT: store <8 x i16> [[REG704]], <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG705:[0-9a-zA-Z_%.]+]] = load <8 x i16>, <8 x i16>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG706:[0-9a-zA-Z_%.]+]] = bitcast <8 x i16> [[REG705]] to <2 x i64>
+// CHECK-NEXT: [[REG707:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG706]], i32 0
+// CHECK-NEXT: ret i64 [[REG707]]
+
+// CHECK: define available_externally i64 @_m_pmulhuw(i64 [[REG708:[0-9a-zA-Z_%.]+]], i64 [[REG709:[0-9a-zA-Z_%.]+]])
+// CHECK: store i64 [[REG708]], i64* [[REG710:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 [[REG709]], i64* [[REG711:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: [[REG712:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG710]], align 8
+// CHECK-NEXT: [[REG713:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG711]], align 8
+// CHECK-NEXT: [[REG714:[0-9a-zA-Z_%.]+]] = call i64 @_mm_mulhi_pu16(i64 [[REG712]], i64 [[REG713]])
+// CHECK-NEXT: ret i64 [[REG714]]
+
+void __attribute__((noinline))
+test_prefetch() {
+  _mm_prefetch(ms, i);
+}
+
+// CHECK-LABEL: @test_prefetch
+
+// CHECK: define available_externally void @_mm_prefetch
+// CHECK: store i8* {{[0-9a-zA-Z_%.]+}}, i8** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i32 {{[0-9a-zA-Z_%.]+}}, i32* {{[0-9a-zA-Z_%.]+}}, align 4
+// CHECK-NEXT: [[REG715:[0-9a-zA-Z_%.]+]] = load i8*, i8** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: call void @llvm.prefetch(i8* [[REG715]], i32 0, i32 3, i32 1)
+// CHECK-NEXT: ret void
+
+void __attribute__((noinline))
+test_rcp() {
+  res = _mm_rcp_ps(m1);
+  res = _mm_rcp_ss(m1);
+}
+
+// CHECK-LABEL: @test_rcp
+
+// CHECK: define available_externally <4 x float> @_mm_rcp_ps
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG716:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG717:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_re(float vector[4])(<4 x float> [[REG716]])
+// CHECK-NEXT: ret <4 x float> [[REG717]]
+
+// CHECK: define available_externally <4 x float> @_mm_rcp_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG718:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG719:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG718]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG719]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG720:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG721:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_rcp_ps(<4 x float> [[REG720]])
+// CHECK-NEXT: store <4 x float> [[REG721]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG722:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG723:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG724:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG722]], <4 x float> [[REG723]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG724]]
+
+void __attribute__((noinline))
+test_rsqrt() {
+  res = _mm_rsqrt_ps(m1);
+  res = _mm_rsqrt_ss(m1);
+}
+
+// CHECK-LABEL: @test_rsqrt
+
+// CHECK: define available_externally <4 x float> @_mm_rsqrt_ps
+// CHECK: [[REG725:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_rsqrte(float vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}})
+// CHECK-NEXT: ret <4 x float> [[REG725]]
+
+// CHECK: define available_externally <4 x float> @_mm_rsqrt_ss
+// CHECK: [[REG726:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> {{[0-9a-zA-Z_%.]+}}, i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG726]], <4 x float>* [[REG727:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG728:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG727]], align 16
+// CHECK-NEXT: [[REG729:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_rsqrte(float vector[4])(<4 x float> [[REG728]])
+// CHECK-NEXT: store <4 x float> [[REG729]], <4 x float>* [[REG730:[0-9a-zA_Z_%.]+]], align 16
+// CHECK-NEXT: [[REG731:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG732:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG730]], align 16
+// CHECK-NEXT: [[REG733:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG731]], <4 x float> [[REG732]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG733]]
+
+void __attribute__((noinline))
+test_sad() {
+  res64 = _mm_sad_pu8(ms[0], ms[1]);
+  res64 = _m_psadbw(ms[0], ms[1]);
+}
+
+// CHECK-LABEL: @test_sad
+
+// CHECK: define available_externally i64 @_mm_sad_pu8(i64 [[REG734:[0-9a-zA-Z_%.]+]], i64 [[REG735:[0-9a-zA-Z_%.]+]])
+// CHECK: store i64 [[REG734]], i64* [[REG736:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i64 [[REG735]], i64* [[REG737:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG738:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to i8*
+// CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 8 [[REG738]], i8 0, i64 8, i1 false)
+// CHECK-NEXT: [[REG739:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG736]], align 8
+// CHECK-NEXT: [[REG740:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[REG739]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG740]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG741:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG742:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG741]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG742]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG743:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG737]], align 8
+// CHECK-NEXT: [[REG744:[0-9a-zA-Z_%.]+]] = insertelement <2 x i64> <i64 0, i64 undef>, i64 [[REG743]], i32 1
+// CHECK-NEXT: store <2 x i64> [[REG744]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG745:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG746:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG745]] to <16 x i8>
+// CHECK-NEXT: store <16 x i8> [[REG746]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG747:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG748:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG749:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_min(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG747]], <16 x i8> [[REG748]])
+// CHECK-NEXT: store <16 x i8> [[REG749]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG750:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG751:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG752:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG750]], <16 x i8> [[REG751]])
+// CHECK-NEXT: store <16 x i8> [[REG752]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG753:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG754:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG755:[0-9a-zA-Z_%.]+]] = call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16])(<16 x i8> [[REG753]], <16 x i8> [[REG754]])
+// CHECK-NEXT: store <16 x i8> [[REG755]], <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG756:[0-9a-zA-Z_%.]+]] = load <16 x i8>, <16 x i8>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG757:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> [[REG756]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: store <4 x i32> [[REG757]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG758:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG759:[0-9a-zA-Z_%.]+]] = call <4 x i32> @vec_sums(<4 x i32> [[REG758]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: store <4 x i32> [[REG759]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG760:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG761:[0-9a-zA-Z_%.]+]] = extractelement <4 x i32> [[REG760]], i32 3
+// CHECK-NEXT: [[REG762:[0-9a-zA-Z_%.]+]] = trunc i32 [[REG761]] to i16
+// CHECK-NEXT: [[REG763:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to [4 x i16]*
+// CHECK-NEXT: [[REG764:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG763]], i64 0, i64 0
+// CHECK-NEXT: store i16 [[REG762]], i16* [[REG764]], align 8
+// CHECK-NEXT: [[REG765:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to i64*
+// CHECK-NEXT: [[REG766:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG765]], align 8
+// CHECK-NEXT: ret i64 [[REG766]]
+
+// CHECK: define available_externally i64 @_m_psadbw
+// CHECK: [[REG767:[0-9a-zA-Z_%.]+]] = call i64 @_mm_sad_pu8
+// CHECK-NEXT: ret i64 [[REG767]]
+
+void __attribute__((noinline))
+test_set() {
+  res = _mm_set_ps(fs[0], fs[1], fs[2], fs[3]);
+  res = _mm_set_ps1(fs[0]);
+  res = _mm_set_ss(fs[0]);
+  res = _mm_set1_ps(fs[0]);
+  res = _mm_setr_ps(fs[0], fs[1], fs[2], fs[3]);
+}
+
+// CHECK-LABEL: @test_set
+
+// CHECK: define available_externally <4 x float> @_mm_set_ps(float [[REG768:[0-9a-zA-Z_%.]+]], float [[REG769:[0-9a-zA-Z_%.]+]], float [[REG770:[0-9a-zA-Z_%.]+]], float [[REG771:[0-9a-zA-Z_%.]+]])
+// CHECK: store float [[REG768]], float* [[REG772:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG769]], float* [[REG773:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG770]], float* [[REG774:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG771]], float* [[REG775:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG776:[0-9a-zA-Z_%.]+]] = load float, float* [[REG775]], align 4
+// CHECK-NEXT: [[REG777:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> undef, float [[REG776]], i32 0
+// CHECK-NEXT: [[REG778:[0-9a-zA-Z_%.]+]] = load float, float* [[REG774]], align 4
+// CHECK-NEXT: [[REG779:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG777]], float [[REG778]], i32 1
+// CHECK-NEXT: [[REG780:[0-9a-zA-Z_%.]+]] = load float, float* [[REG773]], align 4
+// CHECK-NEXT: [[REG781:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG779]], float [[REG780]], i32 2
+// CHECK-NEXT: [[REG782:[0-9a-zA-Z_%.]+]] = load float, float* [[REG772]], align 4
+// CHECK-NEXT: [[REG783:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG781]], float [[REG782]], i32 3
+// CHECK-NEXT: store <4 x float> [[REG783]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG784:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG784]]
+
+// CHECK: define available_externally <4 x float> @_mm_set_ps1(float [[REG785:[0-9a-zA-Z_%.]+]])
+// CHECK: store float [[REG785]], float* [[REG786:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG787:[0-9a-zA-Z_%.]+]] = load float, float* [[REG786]], align 4
+// CHECK-NEXT: [[REG788:[0-9a-zA-Z_%.]+]] = call <4 x float> @_mm_set1_ps(float [[REG787]])
+// CHECK-NEXT: ret <4 x float> [[REG788]]
+
+// CHECK: define available_externally <4 x float> @_mm_set_ss(float [[REG789:[0-9a-zA-Z_%.]+]])
+// CHECK: store float [[REG789:[0-9a-zA-Z_%.]+]], float* [[REG790:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG791:[0-9a-zA-Z_%.]+]] = load float, float* [[REG790]], align 4
+// CHECK-NEXT: [[REG792:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> undef, float [[REG791]], i32 0
+// CHECK-NEXT: [[REG793:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG792]], float 0.000000e+00, i32 1
+// CHECK-NEXT: [[REG794:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG793]], float 0.000000e+00, i32 2
+// CHECK-NEXT: [[REG795:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG794]], float 0.000000e+00, i32 3
+// CHECK-NEXT: store <4 x float> [[REG795]], <4 x float>* [[REG796:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG797:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG796]], align 16
+// CHECK-NEXT: ret <4 x float> [[REG797]]
+
+// CHECK: define available_externally <4 x float> @_mm_set1_ps(float [[REG798:[0-9a-zA-Z_%.]+]])
+// CHECK: store float [[REG798]], float* [[REG799:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG800:[0-9a-zA-Z_%.]+]] = load float, float* [[REG799]], align 4
+// CHECK-NEXT: [[REG801:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> undef, float [[REG800]], i32 0
+// CHECK-NEXT: [[REG802:[0-9a-zA-Z_%.]+]] = load float, float* [[REG799]], align 4
+// CHECK-NEXT: [[REG803:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG801]], float [[REG802]], i32 1
+// CHECK-NEXT: [[REG804:[0-9a-zA-Z_%.]+]] = load float, float* [[REG799]], align 4
+// CHECK-NEXT: [[REG805:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG803]], float [[REG804]], i32 2
+// CHECK-NEXT: [[REG806:[0-9a-zA-Z_%.]+]] = load float, float* [[REG799]], align 4
+// CHECK-NEXT: [[REG807:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG805]], float [[REG806]], i32 3
+// CHECK-NEXT: store <4 x float> [[REG807]], <4 x float>* [[REG808:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG809:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG808]], align 16
+// CHECK-NEXT: ret <4 x float> [[REG809]]
+
+// CHECK: define available_externally <4 x float> @_mm_setr_ps(float [[REG810:[0-9a-zA-Z_%.]+]], float [[REG811:[0-9a-zA-Z_%.]+]], float [[REG812:[0-9a-zA-Z_%.]+]], float [[REG813:[0-9a-zA-Z_%.]+]])
+// CHECK: store float [[REG810]], float* [[REG814:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG811]], float* [[REG815:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG812]], float* [[REG816:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: store float [[REG813]], float* [[REG817:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG818:[0-9a-zA-Z_%.]+]] = load float, float* [[REG814]], align 4
+// CHECK-NEXT: [[REG819:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> undef, float [[REG818]], i32 0
+// CHECK-NEXT: [[REG820:[0-9a-zA-Z_%.]+]] = load float, float* [[REG815]], align 4
+// CHECK-NEXT: [[REG821:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG819]], float [[REG820]], i32 1
+// CHECK-NEXT: [[REG822:[0-9a-zA-Z_%.]+]] = load float, float* [[REG816]], align 4
+// CHECK-NEXT: [[REG823:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG821]], float [[REG822]], i32 2
+// CHECK-NEXT: [[REG824:[0-9a-zA-Z_%.]+]] = load float, float* [[REG817]], align 4
+// CHECK-NEXT: [[REG825:[0-9a-zA-Z_%.]+]] = insertelement <4 x float> [[REG823]], float [[REG824]], i32 3
+// CHECK-NEXT: store <4 x float> [[REG825]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG826:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: ret <4 x float> [[REG826]]
+
+void __attribute__((noinline))
+test_setzero() {
+  res = _mm_setzero_ps();
+}
+
+// CHECK-LABEL: @test_setzero
+
+// CHECK: define available_externally <4 x float> @_mm_setzero_ps
+// CHECK: store <4 x float> zeroinitializer, <4 x float>* [[REG827:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG828:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG827]], align 16
+// CHECK-NEXT: ret <4 x float> [[REG828]]
+
+void __attribute__((noinline))
+test_sfence() {
+  _mm_sfence();
+}
+
+// CHECK-LABEL: @test_sfence
+
+// CHECK: define available_externally void @_mm_sfence
+// CHECK: fence release
+// CHECK-NEXT: ret void
+
+void __attribute__((noinline))
+test_shuffle() {
+  res64 = _mm_shuffle_pi16(ms[0], i);
+  res = _mm_shuffle_ps(m1, m2, i);
+  res64 = _m_pshufw(ms[0], i);
+}
+
+// CHECK-LABEL: @test_shuffle
+
+// CHECK: define available_externally i64 @_mm_shuffle_pi16(i64 [[REG829:[0-9a-zA-Z_%.]+]], i32 signext [[REG830:[0-9a-zA-Z_%.]+]])
+// CHECK: store i64 [[REG829]], i64* [[REG831:[0-9a-zA-Z_%.]+]], align 8
+// CHECK-NEXT: store i32 [[REG830]], i32* [[REG832:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG833:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG832]], align 4
+// CHECK-NEXT: [[REG834:[0-9a-zA-Z_%.]+]] = and i32 [[REG833]], 3
+// CHECK-NEXT: [[REG835:[0-9a-zA-Z_%.]+]] = sext i32 [[REG834]] to i64
+// CHECK-NEXT: store i64 [[REG835]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG836:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG832]], align 4
+// CHECK-NEXT: [[REG837:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG836]], 2
+// CHECK-NEXT: [[REG838:[0-9a-zA-Z_%.]+]] = and i32 [[REG837]], 3
+// CHECK-NEXT: [[REG839:[0-9a-zA-Z_%.]+]] = sext i32 [[REG838]] to i64
+// CHECK-NEXT: store i64 [[REG839]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG840:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG832]], align 4
+// CHECK-NEXT: [[REG841:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG840]], 4
+// CHECK-NEXT: [[REG842:[0-9a-zA-Z_%.]+]] = and i32 [[REG841]], 3
+// CHECK-NEXT: [[REG843:[0-9a-zA-Z_%.]+]] = sext i32 [[REG842]] to i64
+// CHECK-NEXT: store i64 [[REG843]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG844:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG832]], align 4
+// CHECK-NEXT: [[REG845:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG844]], 6
+// CHECK-NEXT: [[REG846:[0-9a-zA-Z_%.]+]] = and i32 [[REG845]], 3
+// CHECK-NEXT: [[REG847:[0-9a-zA-Z_%.]+]] = sext i32 [[REG846]] to i64
+// CHECK-NEXT: store i64 [[REG847]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG848:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG849:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 [[REG848]]
+// CHECK-NEXT: [[REG850:[0-9a-zA-Z_%.]+]] = load i16, i16* [[REG849]], align 2
+// CHECK-NEXT: [[REG851:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to [4 x i16]*
+// CHECK-LE-NEXT: [[REG852:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG851]], i64 0, i64 0
+// CHECK-BE-NEXT: [[REG852:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG851]], i64 0, i64 3
+// CHECK-NEXT: store i16 [[REG850]], i16* [[REG852]]
+// CHECK-NEXT: [[REG853:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG854:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 [[REG853]]
+// CHECK-NEXT: [[REG855:[0-9a-zA-Z_%.]+]] = load i16, i16* [[REG854]], align 2
+// CHECK-NEXT: [[REG856:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to [4 x i16]*
+// CHECK-LE-NEXT: [[REG857:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG856]], i64 0, i64 1
+// CHECK-BE-NEXT: [[REG857:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG856]], i64 0, i64 2
+// CHECK-NEXT: store i16 [[REG855]], i16* [[REG857]]
+// CHECK-NEXT: [[REG858:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG859:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 [[REG858]]
+// CHECK-NEXT: [[REG860:[0-9a-zA-Z_%.]+]] = load i16, i16* [[REG859]], align 2
+// CHECK-NEXT: [[REG861:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to [4 x i16]*
+// CHECK-LE-NEXT: [[REG862:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG861]], i64 0, i64 2
+// CHECK-BE-NEXT: [[REG862:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG861]], i64 0, i64 1
+// CHECK-NEXT: store i16 [[REG860]], i16* [[REG862]]
+// CHECK-NEXT: [[REG863:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG864:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* @_mm_shuffle_pi16.permute_selectors, i64 0, i64 [[REG863]]
+// CHECK-NEXT: [[REG865:[0-9a-zA-Z_%.]+]] = load i16, i16* [[REG864]], align 2
+// CHECK-NEXT: [[REG866:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to [4 x i16]*
+// CHECK-LE-NEXT: [[REG867:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG866]], i64 0, i64 3
+// CHECK-BE-NEXT: [[REG867:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i16], [4 x i16]* [[REG866]], i64 0, i64 0
+// CHECK-NEXT: store i16 [[REG865]], i16* [[REG867]]
+// CHECK-NEXT: [[REG868:[0-9a-zA-Z_%.]+]] = bitcast {{[0-9a-zA-Z_%.]+}}* {{[0-9a-zA-Z_%.]+}} to i64*
+// CHECK-NEXT: [[REG869:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG868]], align 8
+// CHECK-NEXT: [[REG870:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG869]])
+// CHECK-NEXT: store <2 x i64> [[REG870]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG871:[0-9a-zA-Z_%.]+]] = load i64, i64* [[REG831]], align 8
+// CHECK-NEXT: [[REG872:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_splats(unsigned long long)(i64 [[REG871]])
+// CHECK-NEXT: store <2 x i64> [[REG872]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG873:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG874:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG875:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG876:[0-9a-zA-Z_%.]+]] = bitcast <2 x i64> [[REG875]] to <16 x i8>
+// CHECK-NEXT: [[REG877:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_perm(unsigned long long vector[2], unsigned long long vector[2], unsigned char vector[16])(<2 x i64> [[REG873]], <2 x i64> [[REG874]], <16 x i8> [[REG876]])
+// CHECK-NEXT: store <2 x i64> [[REG877]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG878:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG879:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG878]], i32 0
+// CHECK-NEXT: ret i64 [[REG879]]
+
+// CHECK: define available_externally <4 x float> @_mm_shuffle_ps(<4 x float> [[REG880:[0-9a-zA-Z_%.]+]], <4 x float> [[REG881:[0-9a-zA-Z_%.]+]], i32 signext [[REG882:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG880]], <4 x float>* [[REG883:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG881]], <4 x float>* [[REG884:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store i32 [[REG882]], i32* [[REG885:[0-9a-zA-Z_%.]+]], align 4
+// CHECK-NEXT: [[REG886:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG885]], align 4
+// CHECK-NEXT: [[REG887:[0-9a-zA-Z_%.]+]] = and i32 [[REG886]], 3
+// CHECK-NEXT: [[REG888:[0-9a-zA-Z_%.]+]] = sext i32 [[REG887]] to i64
+// CHECK-NEXT: store i64 [[REG888]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG889:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG885]], align 4
+// CHECK-NEXT: [[REG890:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG889]], 2
+// CHECK-NEXT: [[REG891:[0-9a-zA-Z_%.]+]] = and i32 [[REG890]], 3
+// CHECK-NEXT: [[REG892:[0-9a-zA-Z_%.]+]] = sext i32 [[REG891]] to i64
+// CHECK-NEXT: store i64 [[REG892]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG893:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG885]], align 4
+// CHECK-NEXT: [[REG894:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG893]], 4
+// CHECK-NEXT: [[REG895:[0-9a-zA-Z_%.]+]] = and i32 [[REG894]], 3
+// CHECK-NEXT: [[REG896:[0-9a-zA-Z_%.]+]] = sext i32 [[REG895]] to i64
+// CHECK-NEXT: store i64 [[REG896]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG897:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG885]], align 4
+// CHECK-NEXT: [[REG898:[0-9a-zA-Z_%.]+]] = ashr i32 [[REG897]], 6
+// CHECK-NEXT: [[REG899:[0-9a-zA-Z_%.]+]] = and i32 [[REG898]], 3
+// CHECK-NEXT: [[REG900:[0-9a-zA-Z_%.]+]] = sext i32 [[REG899]] to i64
+// CHECK-NEXT: store i64 [[REG900]], i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG901:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG902:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64 [[REG901]]
+// CHECK-NEXT: [[REG903:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG902]], align 4
+// CHECK-NEXT: [[REG904:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG905:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG904]], i32 [[REG903]], i32 0
+// CHECK-NEXT: store <4 x i32> [[REG905]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG906:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG907:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64 [[REG906]]
+// CHECK-NEXT: [[REG908:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG907]], align 4
+// CHECK-NEXT: [[REG909:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG910:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG909]], i32 [[REG908]], i32 1
+// CHECK-NEXT: store <4 x i32> [[REG910]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG911:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG912:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64 [[REG911]]
+// CHECK-NEXT: [[REG913:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG912]], align 4
+// CHECK-NEXT: [[REG914:[0-9a-zA-Z_%.]+]] = add i32 [[REG913]], 269488144
+// CHECK-NEXT: [[REG915:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG916:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG915]], i32 [[REG914]], i32 2
+// CHECK-NEXT: store <4 x i32> [[REG916]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG917:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG918:[0-9a-zA-Z_%.]+]] = getelementptr inbounds [4 x i32], [4 x i32]* @_mm_shuffle_ps.permute_selectors, i64 0, i64 [[REG917]]
+// CHECK-NEXT: [[REG919:[0-9a-zA-Z_%.]+]] = load i32, i32* [[REG918]], align 4
+// CHECK-NEXT: [[REG920:[0-9a-zA-Z_%.]+]] = add i32 [[REG919]], 269488144
+// CHECK-NEXT: [[REG921:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG922:[0-9a-zA-Z_%.]+]] = insertelement <4 x i32> [[REG921]], i32 [[REG920]], i32 3
+// CHECK-NEXT: store <4 x i32> [[REG922]], <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG923:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG924:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG925:[0-9a-zA-Z_%.]+]] = load <4 x i32>, <4 x i32>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG926:[0-9a-zA-Z_%.]+]] = bitcast <4 x i32> [[REG925]] to <16 x i8>
+// CHECK-NEXT: [[REG927:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG923]], <4 x float> [[REG924]], <16 x i8> [[REG926]])
+// CHECK-NEXT: ret <4 x float> [[REG927]]
+
+// CHECK: define available_externally i64 @_m_pshufw
+// CHECK: [[REG928:[0-9a-zA-Z_%.]+]] = call i64 @_mm_shuffle_pi16
+// CHECK-NEXT: ret i64 [[REG928]]
+
+void __attribute__((noinline))
+test_sqrt() {
+  res = _mm_sqrt_ps(m1);
+  res = _mm_sqrt_ss(m1);
+}
+
+// CHECK-LABEL: @test_sqrt
+
+// CHECK: define available_externally <4 x float> @_mm_sqrt_ps
+// CHECK: [[REG929:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sqrt(float vector[4])(<4 x float> {{[0-9a-zA-Z_%.]+}})
+// CHECK-NEXT: ret <4 x float> [[REG929]]
+
+// CHECK: define available_externally <4 x float> @_mm_sqrt_ss
+// CHECK: [[REG930:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG931:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG930]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG931]], <4 x float>* [[REG932:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG933:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG932]], align 16
+// CHECK-NEXT: [[REG934:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sqrt(float vector[4])(<4 x float> [[REG933]])
+// CHECK-NEXT: store <4 x float> [[REG934]], <4 x float>* [[REG935:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG936:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG937:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG935]], align 16
+// CHECK-NEXT: [[REG938:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG936]], <4 x float> [[REG937]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG938]]
+
+void __attribute__((noinline))
+test_store() {
+  _mm_store_ps(fs, m1);
+  _mm_store_ps1(fs, m1);
+  _mm_store_ss(fs, m1);
+  _mm_store1_ps(fs, m1);
+  _mm_storeh_pi(ms, m1);
+  _mm_storel_pi(ms, m1);
+  _mm_storer_ps(fs, m1);
+}
+
+// CHECK-LABEL: @test_store
+
+// CHECK: define available_externally void @_mm_store_ps
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG939:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG940:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG941:[0-9a-zA-Z_%.]+]] = bitcast float* [[REG940]] to <4 x float>*
+// CHECK-NEXT: call void @vec_st(float vector[4], int, float vector[4]*)(<4 x float> [[REG939]], i32 signext 0, <4 x float>* [[REG941]])
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_store_ps1
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG942:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG943:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: call void @_mm_store1_ps(float* [[REG942]], <4 x float> [[REG943]])
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_store_ss
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG944:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG945:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG944]], i32 0
+// CHECK-NEXT: [[REG946:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store float [[REG945]], float* [[REG946]], align 4
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_store1_ps
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG947:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG948:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG947]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG948]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG949:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG950:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: call void @_mm_store_ps(float* [[REG949]], <4 x float> [[REG950]])
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_storeh_pi
+// CHECK: store i64* {{[0-9a-zA-Z_%.]+}}, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG951:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG952:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG951]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[REG952]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG953:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG954:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG953]], i32 1
+// CHECK-NEXT: [[REG955:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 [[REG954]], i64* [[REG955]], align 8
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_storel_pi
+// CHECK: store i64* {{[0-9a-zA-Z_%.]+}}, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG956:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG957:[0-9a-zA-Z_%.]+]] = bitcast <4 x float> [[REG956]] to <2 x i64>
+// CHECK-NEXT: store <2 x i64> [[REG957]], <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG958:[0-9a-zA-Z_%.]+]] = load <2 x i64>, <2 x i64>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG959:[0-9a-zA-Z_%.]+]] = extractelement <2 x i64> [[REG958]], i32 0
+// CHECK-NEXT: [[REG960:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 [[REG959]], i64* [[REG960]], align 8
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_storer_ps
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG961:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG962:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG963:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_perm(float vector[4], float vector[4], unsigned char vector[16])(<4 x float> [[REG961]], <4 x float> [[REG962]], <16 x i8> <i8 28, i8 29, i8 30, i8 31, i8 24, i8 25, i8 26, i8 27, i8 20, i8 21, i8 22, i8 23, i8 16, i8 17, i8 18, i8 19>)
+// CHECK-NEXT: store <4 x float> [[REG963]], <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG964:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG965:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: call void @_mm_store_ps(float* [[REG964]], <4 x float> [[REG965]])
+// CHECK-NEXT: ret void
+
+void __attribute__((noinline))
+test_stream() {
+  _mm_stream_pi(&res64, ms[0]);
+  _mm_stream_ps(&fs[0], m1);
+}
+
+// CHECK-LABEL: @test_stream
+
+/// CHECK: define available_externally void @_mm_stream_pi
+// CHECK: store i64* {{[0-9a-zA-Z_%.]+}}, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 {{[0-9a-zA-Z_%.]+}}, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG966:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: call void asm sideeffect "\09dcbtstt\090,$0", "b,~{memory}"(i64* [[REG966]])
+// CHECK-NEXT: [[REG967:[0-9a-zA-Z_%.]+]] = load i64, i64* {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG968:[0-9a-zA-Z_%.]+]] = load i64*, i64** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store i64 [[REG967]], i64* [[REG968]], align 8
+// CHECK-NEXT: ret void
+
+// CHECK: define available_externally void @_mm_stream_ps
+// CHECK: store float* {{[0-9a-zA-Z_%.]+}}, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG969:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: call void asm sideeffect "\09dcbtstt\090,$0", "b,~{memory}"(float* [[REG969]])
+// CHECK-NEXT: [[REG970:[0-9a-zA-Z_%.]+]] = load float*, float** {{[0-9a-zA-Z_%.]+}}, align 8
+// CHECK-NEXT: [[REG971:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: call void @_mm_store_ps(float* [[REG970]], <4 x float> [[REG971]])
+// CHECK-NEXT: ret void
+
+void __attribute__((noinline))
+test_sub() {
+  res = _mm_sub_ps(m1, m2);
+  res = _mm_sub_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_sub
+
+// CHECK: define available_externally <4 x float> @_mm_sub_ps(<4 x float> [[REG972:[0-9a-zA-Z_%.]+]], <4 x float> [[REG973:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG972]], <4 x float>* [[REG974:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG973]], <4 x float>* [[REG975:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG976:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG974]], align 16
+// CHECK-NEXT: [[REG977:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG975]], align 16
+// CHECK-NEXT: [[REG978:[0-9a-zA-Z_%.]+]] = fsub <4 x float> [[REG976]], [[REG977]]
+// CHECK-NEXT: ret <4 x float> [[REG978]]
+
+// CHECK: define available_externally <4 x float> @_mm_sub_ss(<4 x float> [[REG979:[0-9a-zA-Z_%.]+]], <4 x float> [[REG980:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG979]], <4 x float>* [[REG981:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG980]], <4 x float>* [[REG982:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG983:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG981]], align 16
+// CHECK-NEXT: [[REG984:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG983]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG984]], <4 x float>* [[REG985:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG986:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG982]], align 16
+// CHECK-NEXT: [[REG987:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_splat(float vector[4], unsigned int)(<4 x float> [[REG986]], i32 zeroext 0)
+// CHECK-NEXT: store <4 x float> [[REG987]], <4 x float>* [[REG988:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG989:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG985:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG990:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG988:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG991:[0-9a-zA-Z_%.]+]] = fsub <4 x float> [[REG989]], [[REG990]]
+// CHECK-NEXT: store <4 x float> [[REG991]], <4 x float>* [[REG992:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG993:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG981]], align 16
+// CHECK-NEXT: [[REG994:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG992]], align 16
+// CHECK-NEXT: [[REG995:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_sel(float vector[4], float vector[4], unsigned int vector[4])(<4 x float> [[REG993]], <4 x float> [[REG994]], <4 x i32> <i32 -1, i32 0, i32 0, i32 0>)
+// CHECK-NEXT: ret <4 x float> [[REG995]]
+
+void __attribute__((noinline))
+test_transpose() {
+  __m128 m3, m4;
+  _MM_TRANSPOSE4_PS(m1, m2, m3, m4);
+}
+
+// CHECK-LABEL: @test_transpose
+
+// CHECK: br label %[[REG996:[0-9a-zA-Z_%.]+]]
+// CHECK: [[REG996]]:
+// CHECK: [[REG997:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrghw(float vector[4], float vector[4])
+// CHECK: [[REG998:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrghw(float vector[4], float vector[4])
+// CHECK: [[REG999:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrglw(float vector[4], float vector[4])
+// CHECK: [[REG1000:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrglw(float vector[4], float vector[4])
+// CHECK: [[REG1001:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergeh(long long vector[2], long long vector[2])
+// CHECK: [[REG1002:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergel(long long vector[2], long long vector[2])
+// CHECK: [[REG1003:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergeh(long long vector[2], long long vector[2])
+// CHECK: [[REG1004:[0-9a-zA-Z_%.]+]] = call <2 x i64> @vec_mergel(long long vector[2], long long vector[2])
+// CHECK: ret void
+
+void __attribute__((noinline))
+test_ucomi() {
+  i = _mm_ucomieq_ss(m1, m2);
+  i = _mm_ucomige_ss(m1, m2);
+  i = _mm_ucomigt_ss(m1, m2);
+  i = _mm_ucomile_ss(m1, m2);
+  i = _mm_ucomilt_ss(m1, m2);
+  i = _mm_ucomineq_ss(m1, m2);
+}
+
+// CHECK-LABEL: @test_ucomi
+
+// CHECK: define available_externally signext i32 @_mm_ucomieq_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1005:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1006:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1005]], i32 0
+// CHECK-NEXT: [[REG1007:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1008:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1007]], i32 0
+// CHECK-NEXT: [[REG1009:[0-9a-zA-Z_%.]+]] = fcmp oeq float [[REG1006]], [[REG1008]]
+// CHECK-NEXT: [[REG1010:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1009]] to i32
+// CHECK-NEXT: ret i32 [[REG1010]]
+
+// CHECK: define available_externally signext i32 @_mm_ucomige_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1011:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1012:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1011]], i32 0
+// CHECK-NEXT: [[REG1013:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1014:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1013]], i32 0
+// CHECK-NEXT: [[REG1015:[0-9a-zA-Z_%.]+]] = fcmp oge float [[REG1012]], [[REG1014]]
+// CHECK-NEXT: [[REG1016:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1015]] to i32
+// CHECK-NEXT: ret i32 [[REG1016]]
+
+// CHECK: define available_externally signext i32 @_mm_ucomigt_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1017:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1018:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1017]], i32 0
+// CHECK-NEXT: [[REG1019:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1020:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1019]], i32 0
+// CHECK-NEXT: [[REG1021:[0-9a-zA-Z_%.]+]] = fcmp ogt float [[REG1018]], [[REG1020]]
+// CHECK-NEXT: [[REG1022:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1021]] to i32
+// CHECK-NEXT: ret i32 [[REG1022]]
+
+// CHECK: define available_externally signext i32 @_mm_ucomile_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1023:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1024:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1023]], i32 0
+// CHECK-NEXT: [[REG1025:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1026:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1025]], i32 0
+// CHECK-NEXT: [[REG1027:[0-9a-zA-Z_%.]+]] = fcmp ole float [[REG1024]], [[REG1026]]
+// CHECK-NEXT: [[REG1028:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1027]] to i32
+// CHECK-NEXT: ret i32 [[REG1028]]
+
+// CHECK: define available_externally signext i32 @_mm_ucomilt_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1029:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1030:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1029]], i32 0
+// CHECK-NEXT: [[REG1031:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1032:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1031]], i32 0
+// CHECK-NEXT: [[REG1033:[0-9a-zA-Z_%.]+]] = fcmp olt float [[REG1030]], [[REG1032]]
+// CHECK-NEXT: [[REG1034:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1033]] to i32
+// CHECK-NEXT: ret i32 [[REG1034]]
+
+// CHECK: define available_externally signext i32 @_mm_ucomineq_ss
+// CHECK: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: store <4 x float> {{[0-9a-zA-Z_%.]+}}, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1035:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1036:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1035]], i32 0
+// CHECK-NEXT: [[REG1037:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* {{[0-9a-zA-Z_%.]+}}, align 16
+// CHECK-NEXT: [[REG1038:[0-9a-zA-Z_%.]+]] = extractelement <4 x float> [[REG1037]], i32 0
+// CHECK-NEXT: [[REG1039:[0-9a-zA-Z_%.]+]] = fcmp une float [[REG1036]], [[REG1038]]
+// CHECK-NEXT: [[REG1040:[0-9a-zA-Z_%.]+]] = zext i1 [[REG1039]] to i32
+// CHECK-NEXT: ret i32 [[REG1040]]
+
+void __attribute__((noinline))
+test_undefined() {
+  res = _mm_undefined_ps();
+}
+
+// CHECK-LABEL: @test_undefined
+
+// CHECK: define available_externally <4 x float> @_mm_undefined_ps
+// CHECK: [[REG1041:[0-9a-zA-Z_%.]+]] = alloca <4 x float>, align 16
+// CHECK-NEXT: [[REG1042:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1041]], align 16
+// CHECK-NEXT: store <4 x float> [[REG1042]], <4 x float>* [[REG1041]], align 16
+// CHECK-NEXT: [[REG1043:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1041]], align 16
+// CHECK-NEXT: ret <4 x float> [[REG1043]]
+
+void __attribute__((noinline))
+test_unpack() {
+  res = _mm_unpackhi_ps(m1, m2);
+  res = _mm_unpacklo_ps(m1, m2);
+}
+
+// CHECK-LABEL: @test_unpack
+
+// CHECK: define available_externally <4 x float> @_mm_unpackhi_ps(<4 x float> [[REG1044:[0-9a-zA-Z_%.]+]], <4 x float> [[REG1045:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG1044]], <4 x float>* [[REG1046:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG1045]], <4 x float>* [[REG1047:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG1048:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1046]], align 16
+// CHECK-NEXT: [[REG1049:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1047]], align 16
+// CHECK-NEXT: [[REG1050:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrglw(float vector[4], float vector[4])(<4 x float> [[REG1048]], <4 x float> [[REG1049]])
+// CHECK-NEXT: ret <4 x float> [[REG1050]]
+
+// CHECK: define available_externally <4 x float> @_mm_unpacklo_ps(<4 x float> [[REG1051:[0-9a-zA-Z_%.]+]], <4 x float> [[REG1052:[0-9a-zA-Z_%.]+]])
+// CHECK: store <4 x float> [[REG1051]], <4 x float>* [[REG1053:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: store <4 x float> [[REG1052]], <4 x float>* [[REG1054:[0-9a-zA-Z_%.]+]], align 16
+// CHECK-NEXT: [[REG1055:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1053]], align 16
+// CHECK-NEXT: [[REG1056:[0-9a-zA-Z_%.]+]] = load <4 x float>, <4 x float>* [[REG1054]], align 16
+// CHECK-NEXT: [[REG1057:[0-9a-zA-Z_%.]+]] = call <4 x float> @vec_vmrghw(float vector[4], float vector[4])(<4 x float> [[REG1055]], <4 x float> [[REG1056]])
+// CHECK-NEXT: ret <4 x float> [[REG1057]]
diff --git a/test/Headers/ppc-intrinsics.c b/test/Headers/ppc-intrinsics.c

deleted file mode 100644 (file)

index 622ce90..0000000
--- a/test/Headers/ppc-intrinsics.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// REQUIRES: powerpc-registered-target
-
-// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -target powerpc64-gnu-linux %s -Xclang -verify -o - | FileCheck %s
-// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -target powerpc64-gnu-linux %s -Xclang -verify -x c++ -o - | FileCheck %s
-// expected-no-diagnostics
-
-// RUN: not %clang -S -emit-llvm -target powerpc64-gnu-linux %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
-
-#include <mmintrin.h>
-// CHECK-ERROR: mmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
-
-// CHECK: target triple = "powerpc64-
-// CHECK: !llvm.module.flags =
diff --git a/test/Headers/ppc-mmx-intrinsics.c b/test/Headers/ppc-mmx-intrinsics.c

new file mode 100644 (file)

index 0000000..406694d
--- /dev/null
+++ b/test/Headers/ppc-mmx-intrinsics.c
@@ -0,0 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+
+// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr7 -target powerpc64-unknown-linux-gnu %s -Xclang -verify
+// RUN: %clang -S -emit-llvm -DNO_WARN_X86_INTRINSICS -mcpu=pwr7 -target powerpc64-unknown-linux-gnu %s -Xclang -verify -x c++
+// expected-no-diagnostics
+
+// RUN: not %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
+
+#include <mmintrin.h>
+// CHECK-ERROR: mmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
diff --git a/test/Headers/ppc-sse-intrinsics.c b/test/Headers/ppc-sse-intrinsics.c

new file mode 100644 (file)

index 0000000..91906f0
--- /dev/null
+++ b/test/Headers/ppc-sse-intrinsics.c
@@ -0,0 +1,22 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: powerpc-registered-target
+
+// Since mm_malloc.h references system native stdlib.h, doing cross-compile
+// testing may cause unexpected problems. This would affect xmmintrin.h and
+// other following intrinsics headers. If there's need to test them using
+// cross-compile, please add -ffreestanding to compiler options, like
+// test/CodeGen/ppc-xmmintrin.c.
+
+// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm -DNO_WARN_X86_INTRINSICS %s -mcpu=pwr7 -Xclang -verify
+// RUN: %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm -DNO_WARN_X86_INTRINSICS %s -mcpu=pwr7 -Xclang -verify -x c++
+// expected-no-diagnostics
+
+// RUN: not %clang -target powerpc64-unknown-linux-gnu -S -emit-llvm %s -mcpu=pwr7 -o /dev/null 2>&1 | FileCheck %s -check-prefix=SSE-ERROR
+
+// Don't include mm_malloc.h, it's system specific.
+#define _MM_MALLOC_H_INCLUDED
+
+// Altivec must be enabled.
+#include <xmmintrin.h>
+
+// SSE-ERROR: xmmintrin.h:{{[0-9]+}}:{{[0-9]+}}: error: "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
author	Zi Xuan Wu <wuzish@cn.ibm.com>
	Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)
committer	Zi Xuan Wu <wuzish@cn.ibm.com>
	Fri, 31 May 2019 04:42:13 +0000 (04:42 +0000)
lib/Headers/CMakeLists.txt		patch \| blob \| history
lib/Headers/ppc_wrappers/mm_malloc.h	[new file with mode: 0644]	patch \| blob
lib/Headers/ppc_wrappers/xmmintrin.h	[new file with mode: 0644]	patch \| blob
test/CodeGen/ppc-mm-malloc-le.c	[new file with mode: 0644]	patch \| blob
test/CodeGen/ppc-mm-malloc.c	[new file with mode: 0644]	patch \| blob
test/CodeGen/ppc-mmintrin.c		patch \| blob \| history
test/CodeGen/ppc-xmmintrin.c	[new file with mode: 0644]	patch \| blob
test/Headers/ppc-intrinsics.c	[deleted file]	patch \| blob \| history
test/Headers/ppc-mmx-intrinsics.c	[new file with mode: 0644]	patch \| blob
test/Headers/ppc-sse-intrinsics.c	[new file with mode: 0644]	patch \| blob