GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;
- // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
- def int_x86_avx_vperm2f128_pd_256 :
- Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
- llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx_vperm2f128_ps_256 :
- Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
- llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx_vperm2f128_si_256 :
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-
def int_x86_avx512_mask_vpermi2var_d_128 :
GCCBuiltin<"__builtin_ia32_vpermi2vard128_mask">,
Intrinsic<[llvm_v4i32_ty],
def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">,
Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty],
[IntrNoMem]>;
- // TODO: Remove and autoupgrade using implementation in CGBuiltins.cpp
- def int_x86_avx2_vperm2i128 :
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
}
// Conditional load ops
Name.startswith("avx2.pcmpgt.") || // Added in 3.1
Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+ Name.startswith("avx.vperm2f128.") || // Added in 6.0
+ Name == "avx2.vperm2i128" || // Added in 6.0
Name == "sse.add.ss" || // Added in 4.0
Name == "sse2.add.sd" || // Added in 4.0
Name == "sse.sub.ss" || // Added in 4.0
if (CI->getNumArgOperands() == 4)
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
+ } else if (IsX86 && (Name.startswith("avx.vperm2f128.") ||
+ Name == "avx2.vperm2i128")) {
+ // Auto-upgrade the removed vperm2f128/vperm2i128 intrinsics to a generic
+ // shufflevector built from the immediate control byte.
+ //
+ // The immediate permute control byte looks like this:
+ // [1:0] - select 128 bits from sources for low half of destination
+ // [2] - ignore
+ // [3] - zero low half of destination
+ // [5:4] - select 128 bits from sources for high half of destination
+ // [6] - ignore
+ // [7] - zero high half of destination
+
+ // The control byte must be a constant here; cast<> asserts that.
+ uint8_t Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+
+ unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned HalfSize = NumElts / 2;
+ SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+
+ // Determine which operand(s) are actually in use for this instruction.
+ // Bit 1 / bit 5 pick operand 1 over operand 0 for the low / high half.
+ Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+ Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0);
+
+ // If needed, replace operands based on zero mask. If both zero bits are
+ // set, both shuffle inputs become zero vectors and the result is all zero.
+ V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0;
+ V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1;
+
+ // Permute low half of result. Bit 0 picks the high 128-bit half of the
+ // selected source.
+ unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i] = StartIndex + i;
+
+ // Permute high half of result. Indices are offset by NumElts because the
+ // high half always reads from the second shuffle operand (V1).
+ StartIndex = (Imm & 0x10) ? HalfSize : 0;
+ for (unsigned i = 0; i < HalfSize; ++i)
+ ShuffleMask[i + HalfSize] = NumElts + StartIndex + i;
+
+ Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+
} else if (IsX86 && (Name.startswith("avx.vpermil.") ||
Name == "sse2.pshuf.d" ||
Name.startswith("avx512.mask.vpermil.p") ||
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
if (!IsV1Zero && !IsV2Zero) {
+ // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
}
-/// The shuffle mask for a perm2*128 selects any two halves of two 256-bit
-/// source vectors, unless a zero bit is set. If a zero bit is set,
-/// then ignore that half of the mask and clear that half of the vector.
-static Value *simplifyX86vperm2(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
- if (!CInt)
- return nullptr;
-
- VectorType *VecTy = cast<VectorType>(II.getType());
- ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);
-
- // The immediate permute control byte looks like this:
- // [1:0] - select 128 bits from sources for low half of destination
- // [2] - ignore
- // [3] - zero low half of destination
- // [5:4] - select 128 bits from sources for high half of destination
- // [6] - ignore
- // [7] - zero high half of destination
-
- uint8_t Imm = CInt->getZExtValue();
-
- bool LowHalfZero = Imm & 0x08;
- bool HighHalfZero = Imm & 0x80;
-
- // If both zero mask bits are set, this was just a weird way to
- // generate a zero vector.
- if (LowHalfZero && HighHalfZero)
- return ZeroVector;
-
- // If 0 or 1 zero mask bits are set, this is a simple shuffle.
- unsigned NumElts = VecTy->getNumElements();
- unsigned HalfSize = NumElts / 2;
- SmallVector<uint32_t, 8> ShuffleMask(NumElts);
-
- // The high bit of the selection field chooses the 1st or 2nd operand.
- bool LowInputSelect = Imm & 0x02;
- bool HighInputSelect = Imm & 0x20;
-
- // The low bit of the selection field chooses the low or high half
- // of the selected operand.
- bool LowHalfSelect = Imm & 0x01;
- bool HighHalfSelect = Imm & 0x10;
-
- // Determine which operand(s) are actually in use for this instruction.
- Value *V0 = LowInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
- Value *V1 = HighInputSelect ? II.getArgOperand(1) : II.getArgOperand(0);
-
- // If needed, replace operands based on zero mask.
- V0 = LowHalfZero ? ZeroVector : V0;
- V1 = HighHalfZero ? ZeroVector : V1;
-
- // Permute low half of result.
- unsigned StartIndex = LowHalfSelect ? HalfSize : 0;
- for (unsigned i = 0; i < HalfSize; ++i)
- ShuffleMask[i] = StartIndex + i;
-
- // Permute high half of result.
- StartIndex = HighHalfSelect ? HalfSize : 0;
- StartIndex += NumElts;
- for (unsigned i = 0; i < HalfSize; ++i)
- ShuffleMask[i + HalfSize] = StartIndex + i;
-
- return Builder.CreateShuffleVector(V0, V1, ShuffleMask);
-}
-
/// Decode XOP integer vector comparison intrinsics.
static Value *simplifyX86vpcom(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder,
}
break;
- case Intrinsic::x86_avx_vperm2f128_pd_256:
- case Intrinsic::x86_avx_vperm2f128_ps_256:
- case Intrinsic::x86_avx_vperm2f128_si_256:
- case Intrinsic::x86_avx2_vperm2i128:
- if (Value *V = simplifyX86vperm2(*II, Builder))
- return replaceInstUsesWith(*II, V);
- break;
-
case Intrinsic::x86_avx_maskload_ps:
case Intrinsic::x86_avx_maskload_pd:
case Intrinsic::x86_avx_maskload_ps_256:
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X86: # BB#0:
+; X86-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_pd_256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT: retq
+ %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X86: # BB#0:
+; X86-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_ps_256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT: retq
+ %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
+; CHECK: # BB#0:
+; CHECK-NEXT: vperm2f128 $33, %ymm0, %ymm1, %ymm0 # encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: retl # encoding: [0xc3]
+; X86-LABEL: test_x86_avx_vperm2f128_si_256:
+; X86: # BB#0:
+; X86-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X86-NEXT: retl
+;
+; X64-LABEL: test_x86_avx_vperm2f128_si_256:
+; X64: # BB#0:
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; X64-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl # encoding: [0xc3]
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl # encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vperm2f128 $3, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x03]
-; CHECK-NEXT: # ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-
-
define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
; AVX-LABEL: test_x86_avx_vpermilvar_pd:
; AVX: # BB#0:
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendw:
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX2-LABEL: test_x86_avx2_vperm2i128:
+; AVX2: ## BB#0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retl
+;
+; AVX512-LABEL: test_x86_avx2_vperm2i128:
+; AVX512: ## BB#0:
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512-NEXT: retl
+ %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
-define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vperm2i128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]
-; CHECK-NEXT: ## ymm0 = ymm0[2,3,0,1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-
define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx2_maskload_q:
; CHECK: ## BB#0:
; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI85_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX2-NEXT: vpsravd LCPI84_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI85_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; AVX512VL-NEXT: vmovdqa LCPI84_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI85_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI84_0, kind: FK_Data_4
+; AVX512VL-NEXT: vpsravd LCPI84_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI84_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
ret <4 x i32> %res
; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI87_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX2-NEXT: vpsravd LCPI86_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI87_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; AVX512VL-NEXT: vmovdqa LCPI86_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI87_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI87_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI87_1, kind: FK_Data_4
+; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI86_0, kind: FK_Data_4
+; AVX512VL-NEXT: vpsravd LCPI86_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI86_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
ret <8 x i32> %res
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+; Test cases derived from the possible immediate values of the vperm2f128 intrinsics.
+
+define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x00:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x01:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $1, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x01]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x02:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x18,0xc0,0x01]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x03:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $33, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x21]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x11:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $17, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x11]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x12:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd $12, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x0d,0xc0,0x0c]
+; CHECK-NEXT: ## ymm0 = ymm1[0,1],ymm0[2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x13:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $49, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x31]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x20:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x21:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $33, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x21]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x22:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $0, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x00]
+; CHECK-NEXT: ## ymm0 = ymm1[0,1,0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x23:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $1, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x01]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3,0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x30:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## encoding: [0xc5,0xfc,0x28,0xc1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %1
+}
+
+define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: perm2ps_0x31:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $49, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x31]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %1
+}
+
+define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x33:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $17, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x11]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3,2,3]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x i64> %a1, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %1
+}
+
+define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x81:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $129, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x81]
+; CHECK-NEXT: ## ymm0 = ymm0[2,3],zero,zero
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x83:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $129, %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe3,0x75,0x06,0xc0,0x81]
+; CHECK-NEXT: ## ymm0 = ymm1[2,3],zero,zero
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> %a1, <4 x double> zeroinitializer, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT: ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: perm2pd_0x08:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $40, %ymm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc0,0x28]
+; CHECK-NEXT: ## ymm0 = zero,zero,ymm0[0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x double> zeroinitializer, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x double> %1
+}
+
+define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: perm2i_0x28:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vperm2f128 $40, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x28]
+; CHECK-NEXT: ## ymm0 = zero,zero,ymm1[0,1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i64> %1
+}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; This should never happen, but make sure we don't crash handling a non-constant immediate byte.
-
-define <4 x double> @perm2pd_non_const_imm(<4 x double> %a0, <4 x double> %a1, i8 %b) {
-; CHECK-LABEL: @perm2pd_non_const_imm(
-; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
-; CHECK-NEXT: ret <4 x double> [[RES]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %b)
- ret <4 x double> %res
-
-}
-
-
-; In the following 4 tests, both zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0x88(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x88(
-; CHECK-NEXT: ret <4 x double> zeroinitializer
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 136)
- ret <4 x double> %res
-
-}
-
-define <8 x float> @perm2ps_0x88(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x88(
-; CHECK-NEXT: ret <8 x float> zeroinitializer
-;
- %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 136)
- ret <8 x float> %res
-
-}
-
-define <8 x i32> @perm2si_0x88(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: @perm2si_0x88(
-; CHECK-NEXT: ret <8 x i32> zeroinitializer
-;
- %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 136)
- ret <8 x i32> %res
-
-}
-
-define <4 x i64> @perm2i_0x88(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x88(
-; CHECK-NEXT: ret <4 x i64> zeroinitializer
-;
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 136)
- ret <4 x i64> %res
-
-}
-
-
-; The other control bits are ignored when zero mask bits of the immediate are set.
-
-define <4 x double> @perm2pd_0xff(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0xff(
-; CHECK-NEXT: ret <4 x double> zeroinitializer
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 255)
- ret <4 x double> %res
-
-}
-
-
-; The following 16 tests are simple shuffles, except for 2 cases where we can just return one of the
-; source vectors. Verify that we generate the right shuffle masks and undef source operand where possible..
-
-define <4 x double> @perm2pd_0x00(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x00(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 0)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x01(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x01(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 1)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x02(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x02(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 2)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x03(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x03(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x10(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x10(
-; CHECK-NEXT: ret <4 x double> %a0
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 16)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x11(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x11(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 17)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x12(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x12(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 18)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x13(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x13(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> %a0, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 19)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x20(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x20(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 32)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x21(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x21(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 33)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x22(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x22(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 34)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x23(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x23(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 35)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x30(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x30(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 48)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x31(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x31(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 49)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x32(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x32(
-; CHECK-NEXT: ret <4 x double> %a1
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 50)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x33(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x33(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 51)
- ret <4 x double> %res
-
-}
-
-; Confirm that a mask for 32-bit elements is also correct.
-
-define <8 x float> @perm2ps_0x31(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: @perm2ps_0x31(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: ret <8 x float> [[TMP1]]
-;
- %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 49)
- ret <8 x float> %res
-
-}
-
-
-; Confirm that the AVX2 version works the same.
-
-define <4 x i64> @perm2i_0x33(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x33(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
-; CHECK-NEXT: ret <4 x i64> [[TMP1]]
-;
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 51)
- ret <4 x i64> %res
-
-}
-
-
-; Confirm that when a single zero mask bit is set, we replace a source vector with zeros.
-
-define <4 x double> @perm2pd_0x81(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x81(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a0, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 129)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x83(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x83(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> %a1, <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 131)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x28(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x28(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 40)
- ret <4 x double> %res
-
-}
-
-define <4 x double> @perm2pd_0x08(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: @perm2pd_0x08(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> <double 0.000000e+00, double 0.000000e+00, double undef, double undef>, <4 x double> %a0, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x double> [[TMP1]]
-;
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 8)
- ret <4 x double> %res
-
-}
-
-; Check one more with the AVX2 version.
-
-define <4 x i64> @perm2i_0x28(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: @perm2i_0x28(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: ret <4 x i64> [[TMP1]]
-;
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 40)
- ret <4 x i64> %res
-
-}
-
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readnone
-