From 85631f9bbcd4b92ffc760b3d768b5826e8093a65 Mon Sep 17 00:00:00 2001
From: Adam Nemet <anemet@apple.com>
Date: Wed, 14 Jan 2015 01:31:17 +0000
Subject: [PATCH] [AVX512] Add FP unpack intrinsics

These are implemented with __builtin_shufflevector just like AVX.

We have some tests on the LLVM side to assert that these shufflevectors do
indeed generate the corresponding unpck{l,h}p{s,d} instructions.

Part of <rdar://problem/17688758>

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@225922 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Headers/avx512fintrin.h     | 33 +++++++++++++++++++++++++++++++++
 test/CodeGen/avx512f-builtins.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)
diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h
index 9591dcf37a..9c80710110 100644
--- a/lib/Headers/avx512fintrin.h
+++ b/lib/Headers/avx512fintrin.h
@@ -839,6 +839,39 @@ _mm512_cvt_roundpd_epu32(__m512d __A, const int __R)
                 __R);
 }
 
+/* Unpack and Interleave */
+/* Yields {a1,b1,a3,b3,a5,b5,a7,b7}; shuffle indices >= 8 select from __b. */
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_unpackhi_pd(__m512d __a, __m512d __b) {
+  return __builtin_shufflevector(__a, __b, 1, 8+1, 3, 8+3, 5, 8+5, 7, 8+7);
+}
+
+/* Yields {a0,b0,a2,b2,a4,b4,a6,b6}; shuffle indices >= 8 select from __b. */
+static __inline __m512d __attribute__((__always_inline__, __nodebug__))
+_mm512_unpacklo_pd(__m512d __a, __m512d __b) {
+  return __builtin_shufflevector(__a, __b, 0, 8+0, 2, 8+2, 4, 8+4, 6, 8+6);
+}
+
+/* For g = 0, 4, 8, 12 yields {a[g+2], b[g+2], a[g+3], b[g+3]} in that order;
+   shuffle indices >= 16 select from __b. */
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 2,  16+2,  3,  16+3,  6,  16+6,  7,  16+7,
+                                 10, 16+10, 11, 16+11, 14, 16+14, 15, 16+15);
+}
+
+/* For g = 0, 4, 8, 12 yields {a[g], b[g], a[g+1], b[g+1]} in that order;
+   shuffle indices >= 16 select from __b. */
+static __inline __m512 __attribute__((__always_inline__, __nodebug__))
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+  return __builtin_shufflevector(__a, __b,
+                                 0,  16+0,  1,  16+1,  4,  16+4,  5,  16+5,
+                                 8,  16+8,  9,  16+9,  12, 16+12, 13, 16+13);
+}
+
 /* Bit Test */
 
 static __inline __mmask16 __attribute__ ((__always_inline__, __nodebug__))
diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c
index ba05f4abdc..8bb013fef7 100644
--- a/test/CodeGen/avx512f-builtins.c
+++ b/test/CodeGen/avx512f-builtins.c
@@ -182,3 +182,31 @@ __mmask8 test_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK: @llvm.x86.avx512.mask.pcmpeq.q.512
   return (__mmask8)_mm512_cmpeq_epi64_mask(__a, __b);
 }
+
+__m512d test_mm512_unpackhi_pd(__m512d a, __m512d b) {
+  // The expected mask takes elements 1, 3, 5, 7 alternately from a and b.
+  // CHECK-LABEL: @test_mm512_unpackhi_pd
+  // CHECK: shufflevector <8 x double> {{.*}} <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  return _mm512_unpackhi_pd(a, b);
+}
+
+__m512d test_mm512_unpacklo_pd(__m512d a, __m512d b) {
+  // The expected mask takes elements 0, 2, 4, 6 alternately from a and b.
+  // CHECK-LABEL: @test_mm512_unpacklo_pd
+  // CHECK: shufflevector <8 x double> {{.*}} <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_unpacklo_pd(a, b);
+}
+
+__m512 test_mm512_unpackhi_ps(__m512 a, __m512 b) {
+  // Operands and result are __m512, matching _mm512_unpackhi_ps's signature.
+  // CHECK-LABEL: @test_mm512_unpackhi_ps
+  // CHECK: shufflevector <16 x float> {{.*}} <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  return _mm512_unpackhi_ps(a, b);
+}
+
+__m512 test_mm512_unpacklo_ps(__m512 a, __m512 b) {
+  // Operands and result are __m512, matching _mm512_unpacklo_ps's signature.
+  // CHECK-LABEL: @test_mm512_unpacklo_ps
+  // CHECK: shufflevector <16 x float> {{.*}} <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  return _mm512_unpacklo_ps(a, b);
+}
-- 
2.40.0