[X86] Use native shuffle vector for the perm2f128 intrinsics

author Craig Topper <craig.topper@intel.com>

Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)

committer Craig Topper <craig.topper@intel.com>

Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)
author Craig Topper <craig.topper@intel.com>
Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)
committer Craig Topper <craig.topper@intel.com>
Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp

index 4240d2de53660c495c1d123655e47dbc5494be85..3e353ea7eca6318a6de065b57faebb8df3fc4952 100644 (file)
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -7923,6 +7923,45 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
      return EmitX86Select(*this, Ops[4], Align, Ops[3]);
    }
  
+  case X86::BI__builtin_ia32_vperm2f128_pd256:
+  case X86::BI__builtin_ia32_vperm2f128_ps256:
+  case X86::BI__builtin_ia32_vperm2f128_si256:
+  case X86::BI__builtin_ia32_permti256: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    unsigned NumElts = Ops[0]->getType()->getVectorNumElements();
+
+    // This takes a very simple approach since there are two lanes and a
+    // shuffle can have 2 inputs. So we reserve the first input for the first
+    // lane and the second input for the second lane. This may result in
+    // duplicate sources, but this can be dealt with in the backend.
+
+    Value *OutOps[2];
+    uint32_t Indices[8];
+    for (unsigned l = 0; l != 2; ++l) {
+      // Determine the source for this lane.
+      if (Imm & (1 << ((l * 4) + 3)))
+        OutOps[l] = llvm::ConstantAggregateZero::get(Ops[0]->getType());
+      else if (Imm & (1 << ((l * 4) + 1)))
+        OutOps[l] = Ops[1];
+      else
+        OutOps[l] = Ops[0];
+
+      for (unsigned i = 0; i != NumElts/2; ++i) {
+        // Start with ith element of the source for this lane.
+        unsigned Idx = (l * NumElts) + i;
+        // If bit 0 of the immediate half is set, switch to the high half of
+        // the source.
+        if (Imm & (1 << (l * 4)))
+          Idx += NumElts/2;
+        Indices[(l * (NumElts/2)) + i] = Idx;
+      }
+    }
+
+    return Builder.CreateShuffleVector(OutOps[0], OutOps[1],
+                                       makeArrayRef(Indices, NumElts),
+                                       "vperm");
+  }
+
    case X86::BI__builtin_ia32_movnti:
    case X86::BI__builtin_ia32_movnti64:
    case X86::BI__builtin_ia32_movntsd:
diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c

index 31a08440d0611c2013cf92254d529bf4e6735e4a..4e77ad166ce06410a058d7e994e32c42c22190c6 100644 (file)
--- a/test/CodeGen/avx-builtins.c
+++ b/test/CodeGen/avx-builtins.c
@@ -678,19 +678,19 @@ __m256 test_mm256_permute_ps(__m256 A) {
  
  __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
    // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
    return _mm256_permute2f128_pd(A, B, 0x31);
  }
  
  __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
    // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
    return _mm256_permute2f128_ps(A, B, 0x13);
  }
  
  __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
    // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
    return _mm256_permute2f128_si256(A, B, 0x20);
  }
  
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c

index bf285821f575bd8388416b9aaf39485b77cd4ec8..f79f60e6db78d972877f9887a8eea3dd7d03007f 100644 (file)
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -907,8 +907,8 @@ __m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
  
  __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
    // CHECK-LABEL: test_mm256_permute2x128_si256
-  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
-  return _mm256_permute2x128_si256(a, b, 0x31);
+  // CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  return _mm256_permute2x128_si256(a, b, 0x38);
  }
  
  __m256i test_mm256_permute4x64_epi64(__m256i a) {
author	Craig Topper <craig.topper@intel.com>
	Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Fri, 15 Sep 2017 23:00:59 +0000 (23:00 +0000)
lib/CodeGen/CGBuiltin.cpp		patch | blob | history
test/CodeGen/avx-builtins.c		patch | blob | history
test/CodeGen/avx2-builtins.c		patch | blob | history