From 028edca1226c5740c5e10f7b59be0e0906a0f3a6 Mon Sep 17 00:00:00 2001 From: Jina Nahias Date: Mon, 6 Nov 2017 07:09:24 +0000 Subject: [PATCH] [x86][AVX512] Lowering Broadcastm intrinsics to LLVM IR This patch, together with a matching clang patch (https://reviews.llvm.org/D38683), implements the lowering of X86 broadcastm intrinsics to IR. Differential Revision: https://reviews.llvm.org/D38684 Change-Id: I709ac0b34641095397e994c8ff7e15d1315b3540 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317458 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/IR/AutoUpgrade.cpp | 11 ++- lib/Target/X86/X86ISelLowering.cpp | 28 ++++--- lib/Target/X86/X86IntrinsicsInfo.h | 6 -- .../X86/avx512cd-intrinsics-fast-isel.ll | 37 +++++++++ .../X86/avx512cd-intrinsics-upgrade.ll | 23 ++++++ test/CodeGen/X86/avx512cd-intrinsics.ll | 22 ------ .../X86/avx512cdvl-intrinsics-upgrade.ll | 44 +++++++++++ test/CodeGen/X86/avx512cdvl-intrinsics.ll | 43 ----------- .../X86/avx512vlcd-intrinsics-fast-isel.ll | 75 +++++++++++++++++++ test/CodeGen/X86/broadcastm-lowering.ll | 7 +- 10 files changed, 208 insertions(+), 88 deletions(-) create mode 100644 test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll create mode 100644 test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 07d499bc193..2c9e9be3da5 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -78,6 +78,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name=="ssse3.pabs.d.128" || // Added in 6.0 Name.startswith("avx2.pabs.") || // Added in 6.0 Name.startswith("avx512.mask.pabs.") || // Added in 6.0 + Name.startswith("avx512.broadcastm") || // Added in 6.0 Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0 Name.startswith("sse2.pcmpeq.") || // Added in 3.1 Name.startswith("sse2.pcmpgt.") || // Added in 3.1 @@ -1027,7 +1028,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT, CI->getArgOperand(0), CI->getArgOperand(1)); Rep = Builder.CreateSExt(Rep, CI->getType(), ""); - } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){ + } else if (IsX86 && (Name.startswith("avx512.broadcastm"))) { + Type *ExtTy = Type::getInt32Ty(C); + if (CI->getOperand(0)->getType()->isIntegerTy(8)) + ExtTy = Type::getInt64Ty(C); + unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / + ExtTy->getPrimitiveSizeInBits(); + Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy); + Rep = Builder.CreateVectorSplat(NumElts, Rep); + } else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))) { unsigned NumElts = CI->getArgOperand(1)->getType()->getVectorNumElements(); Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1a1fd2518b0..357b828d6d2 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6746,6 +6746,9 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); + BitVector UndefElements; + SDValue Ld = BVOp->getSplatValue(&UndefElements); + // Attempt to use VBROADCASTM // From this paterrn: // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) @@ -6753,17 +6756,23 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // // Create (VBROADCASTM v2i1 X) if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { - MVT EltType; - unsigned NumElts; + MVT EltType = VT.getScalarType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue BOperand; SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); - if (ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) { - SDValue BOperand = ZeroExtended.getOperand(0); + if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || + (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && + Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { + if (ZeroExtended) + BOperand = ZeroExtended.getOperand(0); + else + BOperand = Ld.getOperand(0).getOperand(0); if (BOperand.getValueType().isVector() && BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) { - if ((EltType == MVT::i64 && - VT.getVectorElementType() == MVT::i8) || // for broadcastmb2q - (EltType == MVT::i32 && - VT.getVectorElementType() == MVT::i16)) { // for broadcastmw2d + if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 || + NumElts == 8)) || // for broadcastmb2q + (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 || + NumElts == 16))) { // for broadcastmw2d SDValue Brdcst = DAG.getNode(X86ISD::VBROADCASTM, dl, MVT::getVectorVT(EltType, NumElts), BOperand); @@ -6773,9 +6782,6 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, } } - BitVector UndefElements; - SDValue Ld = BVOp->getSplatValue(&UndefElements); - // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) { diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 128d0942a73..e8da3134757 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -422,12 +422,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll new file mode 100644 index 00000000000..ca5e5523a9d --- /dev/null +++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s + +define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = icmp eq <8 x i64> %a, %b + %1 = bitcast <8 x i1> %0 to i8 + %conv.i = zext i8 %1 to i64 + %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0 + %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer + ret <8 x i64> %vecinit7.i.i +} + +define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm512_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0 + %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer + %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64> + ret <8 x i64> %4 +} + + diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll index e5dbff9ac51..92dfe1e087a 100644 --- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll @@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret <8 x i64> %res } + +define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_512: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) + ret <16 x i32> %res +} +declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) + +define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_512: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) + ret <8 x i64> %res +} +declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) + diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll index 7e5a3e8fe25..ab8c80f8dd3 100644 --- a/test/CodeGen/X86/avx512cd-intrinsics.ll +++ b/test/CodeGen/X86/avx512cd-intrinsics.ll @@ -1,28 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s -define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) - ret <16 x i32> %res -} -declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16) - -define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_512: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0 -; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) - ret <8 x i64> %res -} -declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8) - declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly define <8 x i64> @test_conflict_q(<8 x i64> %a) { diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll index f8f47c87100..0e310be3489 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> ret <4 x i64> %res2 } +define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16) + +define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { +; CHECK-LABEL: test_x86_vbroadcastmw_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzwl %di, %eax +; CHECK-NEXT: vpbroadcastd %eax, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) + +define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8) + +define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { +; CHECK-LABEL: test_x86_broadcastmb_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: vpbroadcastq %rax, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8) + diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index 96254f7c95b..2fb50297c62 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i ret <4 x i64> %res2 } -define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0 -; CHECK-NEXT: retq - %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; - ret <8 x i32> %res -} -declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16) - -define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) { -; CHECK-LABEL: test_x86_vbroadcastmw_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0 -; CHECK-NEXT: retq - %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; - ret <4 x i32> %res -} -declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16) - -define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_256: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0 -; CHECK-NEXT: retq - %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; - ret <4 x i64> %res -} -declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8) - -define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) { -; CHECK-LABEL: test_x86_broadcastmb_128: -; CHECK: ## BB#0: -; CHECK-NEXT: kmovw %edi, %k0 -; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0 -; CHECK-NEXT: retq - %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; - ret <2 x i64> %res -} -declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8) diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll new file mode 100644 index 00000000000..ab4cbeb8d5e --- /dev/null +++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s + +define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_mm_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %a to <4 x i32> + %1 = bitcast <2 x i64> %b to <4 x i32> + %2 = icmp eq <4 x i32> %0, %1 + %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> + %4 = bitcast <8 x i1> %3 to i8 + %conv.i = zext i8 %4 to i64 + %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0 + %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer + ret <2 x i64> %vecinit1.i.i +} + +define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) { +; CHECK-LABEL: test_mm256_broadcastmb_epi64: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 +; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = icmp eq <4 x i64> %a, %b + %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> + %2 = bitcast <8 x i1> %1 to i8 + %conv.i = zext i8 %2 to i64 + %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0 + %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer + ret <4 x i64> %vecinit3.i.i +} + +define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0 + %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer + %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64> + ret <2 x i64> %4 +} + +define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) { +; CHECK-LABEL: test_mm256_broadcastmw_epi32: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0 +; CHECK-NEXT: retq +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + %2 = icmp eq <16 x i32> %0, %1 + %3 = bitcast <16 x i1> %2 to i16 + %conv.i = zext i16 %3 to i32 + %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0 + %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer + %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64> + ret <4 x i64> %4 +} + + diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll index 2a8236cf093..fc7b192c2f8 100644 --- a/test/CodeGen/X86/broadcastm-lowering.ll +++ b/test/CodeGen/X86/broadcastm-lowering.ll @@ -80,8 +80,7 @@ define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) { ; AVX512CD-LABEL: test_mm512_epi32: ; AVX512CD: # BB#0: # %entry ; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512CD-NEXT: kmovw %k0, %eax -; AVX512CD-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512VLCDBW-LABEL: test_mm512_epi32: @@ -110,9 +109,7 @@ define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) { ; AVX512CD-NEXT: # kill: %YMM1 %YMM1 %ZMM1 ; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; AVX512CD-NEXT: kmovw %k0, %eax -; AVX512CD-NEXT: movzbl %al, %eax -; AVX512CD-NEXT: vpbroadcastq %rax, %zmm0 +; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512VLCDBW-LABEL: test_mm512_epi64: -- 2.40.0