[X86] Remove VPERM2F128/VPERM2I128 intrinsics and autoupgrade to native shuffles.

author Craig Topper <craig.topper@intel.com>

Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)

committer Craig Topper <craig.topper@intel.com>

Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)
author Craig Topper <craig.topper@intel.com>
Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)
committer Craig Topper <craig.topper@intel.com>
Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td

index 07b5350c45320961b6c528a0bde8ea0ca3cac854..fe63072275563c012812ca4c88b942478de7cecc 100644 (file)
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -965,17 +965,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
          GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;
  
-  // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
-  def int_x86_avx_vperm2f128_pd_256 :
-        Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
-                  llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vperm2f128_ps_256 :
-        Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
-                  llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx_vperm2f128_si_256 :
-        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
-                  llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
    def int_x86_avx512_mask_vpermi2var_d_128 :
         GCCBuiltin<"__builtin_ia32_vpermi2vard128_mask">,
          Intrinsic<[llvm_v4i32_ty],
@@ -1950,10 +1939,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
    def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">,
                Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty],
                          [IntrNoMem]>;
-  // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
-  def int_x86_avx2_vperm2i128 :
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
  }
  
  // Conditional load ops
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp

index cf0097fa10e17a12532c80d47bbaecea09725763..2e8a41e06e456847661cfa31a09eda6bf716ff59 100644 (file)
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -83,6 +83,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
        Name.startswith("avx2.pcmpgt.") || // Added in 3.1
        Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
        Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+      Name.startswith("avx.vperm2f128.") || // Added in 6.0
+      Name == "avx2.vperm2i128" || // Added in 6.0
        Name == "sse.add.ss" || // Added in 4.0
        Name == "sse2.add.sd" || // Added in 4.0
        Name == "sse.sub.ss" || // Added in 4.0
@@ -1426,6 +1428,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
        if (CI->getNumArgOperands() == 4)
          Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                              CI->getArgOperand(2));
+    } else if (IsX86 && (Name.startswith("avx.vperm2f128.") ||
+                         Name == "avx2.vperm2i128")) {
+      // The immediate permute control byte looks like this:
+      //    [1:0] - select 128 bits from sources for low half of destination
+      //    [2]   - ignore
+      //    [3]   - zero low half of destination
+      //    [5:4] - select 128 bits from sources for high half of destination
+      //    [6]   - ignore
+      //    [7]   - zero high half of destination
+
+      uint8_t Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+
+      unsigned NumElts = CI->getType()->getVectorNumElements();
+      unsigned HalfSize = NumElts / 2;
+      SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+
+      // Determine which operand(s) are actually in use for this instruction.
+      Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+      Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+
+      // If needed, replace operands based on zero mask.
+      V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0;
+      V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1;
+
+      // Permute low half of result.
+      unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0;
+      for (unsigned i = 0; i < HalfSize; ++i)
+        ShuffleMask[i] = StartIndex + i;
+
+      // Permute high half of result.
+      StartIndex = (Imm & 0x10) ? HalfSize : 0;
+      for (unsigned i = 0; i < HalfSize; ++i)
+        ShuffleMask[i + HalfSize] = NumElts + StartIndex + i;
+
+      Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+
      } else if (IsX86 && (Name.startswith("avx.vpermil.") ||
                           Name == "sse2.pshuf.d" ||
                           Name.startswith("avx512.mask.vpermil.p") ||
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 4df7621bee2bb0ac0afed266271364fef49a6fed..da4aa959ea29d073c80bbaf53a13d29461d13863 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12174,6 +12174,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
    // If either input operand is a zero vector, use VPERM2X128 because its mask
    // allows us to replace the zero input with an implicit zero.
    if (!IsV1Zero && !IsV2Zero) {
+    // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
+    if (Subtarget.hasAVX2() && V2.isUndef())
+      return SDValue();
+
      // Check for patterns which can be matched with a single insert of a 128-bit
      // subvector.
      bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h

index e3ce18c35761b1a541266e402571dde1166ae5ac..4f1b6572875199d22f692a6ac9b3aa7fb69f9b56 100644 (file)
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -365,9 +365,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
    X86_INTRINSIC_DATA(avx_rsqrt_ps_256,  INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
    X86_INTRINSIC_DATA(avx_sqrt_pd_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
    X86_INTRINSIC_DATA(avx_sqrt_ps_256,   INTR_TYPE_1OP, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
-  X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
    X86_INTRINSIC_DATA(avx_vpermilvar_pd,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
    X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
    X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -424,7 +421,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
    X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
    X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
    X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
    X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
    X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
    X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp

index 2012934dbb9b33f32264a94ae27aad8f0a22b650..61f0329f704f5dc809d9243838172c937c3c40ca 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1094,72 +1094,6 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
    return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
  }
  
-/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
-/// source vectors, unless a zero bit is set. If a zero bit is set,
-/// then ignore that half of the mask and clear that half of the vector.
-static Value *simplifyX86vperm2(const IntrinsicInst &II,
-                                InstCombiner::BuilderTy &Builder) {
-  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
-  if (!CInt)
-    return nullptr;
-
-  VectorType *VecTy = cast<VectorType>(II.getType());
-  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
-
-  // The immediate permute control byte looks like this:
-  //    [1:0] - select 128 bits from sources for low half of destination
-  //    [2]   - ignore
-  //    [3]   - zero low half of destination
-  //    [5:4] - select 128 bits from sources for high half of destination
-  //    [6]   - ignore
-  //    [7]   - zero high half of destination
-
-  uint8_t Imm = CInt->getZExtValue();
-
-  bool LowHalfZero = Imm & 0x08;
-  bool HighHalfZero = Imm & 0x80;
-
-  // If both zero mask bits are set, this was just a weird way to
-  // generate a zero vector.
-  if (LowHalfZero && HighHalfZero)
-    return ZeroVector;
-
-  // If 0 or 1 zero mask bits are set, this is a simple shuffle.
-  unsigned NumElts = VecTy->getNumElements();
-  unsigned HalfSize = NumElts / 2;
-  SmallVector<uint32_t, 8> ShuffleMask(NumElts);
-
-  // The high bit of the selection field chooses the 1st or 2nd operand.
-  bool LowInputSelect = Imm & 0x02;
-  bool HighInputSelect = Imm & 0x20;
-
-  // The low bit of the selection field chooses the low or high half
-  // of the selected operand.
-  bool LowHalfSelect = Imm & 0x01;
-  bool HighHalfSelect = Imm & 0x10;
-
-  // Determine which operand(s) are actually in use for this instruction.
-  Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
-  Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
-
-  // If needed, replace operands based on zero mask.
-  V0 = LowHalfZero ? ZeroVector : V0;
-  V1 = HighHalfZero ? ZeroVector : V1;
-
-  // Permute low half of result.
-  unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
-  for (unsigned i = 0; i < HalfSize; ++i)
-    ShuffleMask[i] = StartIndex + i;
-
-  // Permute high half of result.
-  StartIndex = HighHalfSelect ? HalfSize : 0;
-  StartIndex += NumElts;
-  for (unsigned i = 0; i < HalfSize; ++i)
-    ShuffleMask[i + HalfSize] = StartIndex + i;
-
-  return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
-}
-
  /// Decode XOP integer vector comparison intrinsics.
  static Value *simplifyX86vpcom(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder,
@@ -3018,14 +2952,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
      }
      break;
  
-  case Intrinsic::x86_avx_vperm2f128_pd_256:
-  case Intrinsic::x86_avx_vperm2f128_ps_256:
-  case Intrinsic::x86_avx_vperm2f128_si_256:
-  case Intrinsic::x86_avx2_vperm2i128:
-    if (Value *V = simplifyX86vperm2(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
    case Intrinsic::x86_avx_maskload_ps:
    case Intrinsic::x86_avx_maskload_pd:
    case Intrinsic::x86_avx_maskload_ps_256:
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

index 4e015c2da85724bafc9d206379033408e848bcc5..7ff43f60bbaa7702aec9e6b646ef0c1cd0fe36a4 100644 (file)
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -762,3 +762,66 @@ define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
    ret <8 x float> %res
  }
  declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_si_256:
+; X86:       # BB#0:
+; X86-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_si_256:
+; X64:       # BB#0:
+; X64-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT:    retq
+  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll

index 79c6b1f323786f9c10ca9a7deb0bdef082df4681..4e65790bf3d6b457fe423a4841bc8ac66aae54b7 100644 (file)
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -612,42 +612,6 @@ define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
  declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
  
  
-define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT:    # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT:    retl # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
-  ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-
-
  define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
  ; AVX-LABEL: test_x86_avx_vpermilvar_pd:
  ; AVX:       # BB#0:
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll

index 8f75311db10708a88a45e7180b9ec6284a3f588a..c7babe086f06c19a84ab33e82d19abd1db6f8e7d 100644 (file)
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -1,6 +1,6 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
  
  define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
  ; CHECK-LABEL: test_x86_avx2_pblendw:
@@ -565,3 +565,18 @@ define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
  }
  declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
  
+
+define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX2-LABEL: test_x86_avx2_vperm2i128:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT:    retl
+;
+; AVX512-LABEL: test_x86_avx2_vperm2i128:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT:    retl
+  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll

index 2eaa2dc1ae7993dddc8d61618852dbd05b334d11..77039a13ed56fbca4db732be926cad9a63e4a9e5 100644 (file)
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1002,18 +1002,6 @@ define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
  declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
  
  
-define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vperm2i128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]
-; CHECK-NEXT:    ## ymm0 = ymm0[2,3,0,1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-
  define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
  ; CHECK-LABEL: test_x86_avx2_maskload_q:
  ; CHECK:       ## BB#0:
@@ -1259,18 +1247,18 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
  ; AVX2:       ## BB#0:
  ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
  ; AVX2-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX2-NEXT:    vpsravd LCPI85_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX2-NEXT:    vpsravd LCPI84_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
  ; AVX2-NEXT:    retl ## encoding: [0xc3]
  ;
  ; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
  ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovdqa LCPI85_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT:    vmovdqa LCPI84_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
  ; AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpsravd LCPI85_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vpsravd LCPI84_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
  ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
    %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
    ret <4 x i32> %res
@@ -1296,18 +1284,18 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
  ; AVX2:       ## BB#0:
  ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
  ; AVX2-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX2-NEXT:    vpsravd LCPI87_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX2-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX2-NEXT:    vpsravd LCPI86_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX2-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
  ; AVX2-NEXT:    retl ## encoding: [0xc3]
  ;
  ; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
  ; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovdqa LCPI87_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT:    vmovdqa LCPI86_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
  ; AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX512VL-NEXT:    vpsravd LCPI87_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vpsravd LCPI86_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
  ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
    %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
    ret <8 x i32> %res
diff --git a/test/CodeGen/X86/x86-vperm2.ll b/test/CodeGen/X86/x86-vperm2.ll

new file mode 100644 (file)

index 0000000..4558afa
--- /dev/null
+++ b/test/CodeGen/X86/x86-vperm2.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+; Test cases derived from the possible immediate values of the vperm2f128 intrinsics.
+
+define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x00:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x00]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x01:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $1, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x01]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x02:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x18,0xc0,0x01]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x03:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x11:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x12:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vblendpd $12, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x0d,0xc0,0x0c]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1],ymm0[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x13:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],ymm0[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x20:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x21:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $33, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x21]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x22:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $0, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x00]
+; CHECK-NEXT:    ## ymm0 = ymm1[0,1,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x23:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $1, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x01]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x30:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT:    ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x31:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## encoding: [0xc5,0xfc,0x28,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x33:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x double> %1
+}
+
+define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: perm2ps_0x31:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x float> %1
+}
+
+define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x33:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x i64> %a1, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i64> %1
+}
+
+define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x81:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $129, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x81]
+; CHECK-NEXT:    ## ymm0 = ymm0[2,3],zero,zero
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x83:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $129, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x81]
+; CHECK-NEXT:    ## ymm0 = ymm1[2,3],zero,zero
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> %a1, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x28:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x08:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x double> %1
+}
+
+define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x28:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT:    ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %1 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  ret <4 x i64> %1
+}
diff --git a/test/Transforms/InstCombine/X86/x86-vperm2.ll b/test/Transforms/InstCombine/X86/x86-vperm2.ll

deleted file mode 100644 (file)

index 84f69aa..0000000
--- a/test/Transforms/InstCombine/X86/x86-vperm2.ll
+++ /dev/null
@@ -1,313 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
-
-define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
-; CHECK-LABEL: @perm2pd_non_const_imm(
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
-; CHECK-NEXT:    ret <4 x double> [[RES]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
-  ret <4 x double> %res
-
-}
-
-
-; In the following 4 tests, both zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x88(
-; CHECK-NEXT:    ret <4 x double> zeroinitializer
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
-  ret <4 x double> %res
-
-}
-
-define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x88(
-; CHECK-NEXT:    ret <8 x float> zeroinitializer
-;
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
-  ret <8 x float> %res
-
-}
-
-define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @perm2si_0x88(
-; CHECK-NEXT:    ret <8 x i32> zeroinitializer
-;
-  %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
-  ret <8 x i32> %res
-
-}
-
-define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x88(
-; CHECK-NEXT:    ret <4 x i64> zeroinitializer
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136)
-  ret <4 x i64> %res
-
-}
-
-
-; The other control bits are ignored when zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0xff(
-; CHECK-NEXT:    ret <4 x double> zeroinitializer
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
-  ret <4 x double> %res
-
-}
-
-
-; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
-; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible..
-
-define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x00(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x01(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x02(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x03(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x10(
-; CHECK-NEXT:    ret <4 x double> %a0
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x11(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x12(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x13(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x20(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x21(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x22(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x23(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x30(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x31(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x32(
-; CHECK-NEXT:    ret <4 x double> %a1
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x33(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
-  ret <4 x double> %res
-
-}
-
-; Confirm that a mask for 32-bit elements is also correct.
-
-define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x31(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    ret <8 x float> [[TMP1]]
-;
-  %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
-  ret <8 x float> %res
-
-}
-
-
-; Confirm that the AVX2 version works the same.
-
-define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x33(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
-  ret <4 x i64> %res
-
-}
-
-
-; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.
-
-define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x81(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x83(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x28(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
-  ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x08(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x double> [[TMP1]]
-;
-  %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
-  ret <4 x double> %res
-
-}
-
-; Check one more with the AVX2 version.
-
-define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x28(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x i64> [[TMP1]]
-;
-  %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
-  ret <4 x i64> %res
-
-}
-
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone
-
author	Craig Topper <craig.topper@intel.com>
	Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)
committer	Craig Topper <craig.topper@intel.com>
	Sat, 16 Sep 2017 07:36:14 +0000 (07:36 +0000)
include/llvm/IR/IntrinsicsX86.td		patch \| blob \| history
lib/IR/AutoUpgrade.cpp		patch \| blob \| history
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
lib/Target/X86/X86IntrinsicsInfo.h		patch \| blob \| history
lib/Transforms/InstCombine/InstCombineCalls.cpp		patch \| blob \| history
test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll		patch \| blob \| history
test/CodeGen/X86/avx-intrinsics-x86.ll		patch \| blob \| history
test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll		patch \| blob \| history
test/CodeGen/X86/avx2-intrinsics-x86.ll		patch \| blob \| history
test/CodeGen/X86/x86-vperm2.ll	[new file with mode: 0644]	patch \| blob
test/Transforms/InstCombine/X86/x86-vperm2.ll	[deleted file]	patch \| blob \| history