From 61fe4ef60e3668bf148382478f6a8039e55622a0 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Tue, 1 Oct 2019 16:28:20 +0000
Subject: [PATCH] [X86] Add a VBROADCAST_LOAD ISD opcode representing a scalar
 load broadcasted to a vector.

Summary:
This adds the ISD opcode and a DAG combine to create it. There are
probably some places where we can directly create it, but I'll leave
that for future work.

This updates all of the isel patterns to look for this new node. I
had to add a few additional isel patterns for aligned extloads which
we should probably fix with a DAG combine or something. This does
mean that the broadcast load folding for avx512 can no longer match
a broadcasted aligned extload.

There's still some work to do here for combining a broadcast of a
broadcast_load. We also need to improve extractelement and demanded
vector elements handling of a broadcast_load. I'll try to get those
done before I submit this patch.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68198

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373349 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp            |  43 ++-
 lib/Target/X86/X86ISelLowering.cpp            | 115 ++++++-
 lib/Target/X86/X86ISelLowering.h              |   3 +
 lib/Target/X86/X86InstrAVX512.td              | 288 +++++++++---------
 lib/Target/X86/X86InstrFragmentsSIMD.td       |  22 ++
 lib/Target/X86/X86InstrSSE.td                 |  46 +--
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll |   7 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        |  14 +-
 .../X86/avx512bwvl-intrinsics-upgrade.ll      |  32 +-
 .../X86/avx512vl-intrinsics-upgrade.ll        |  22 +-
 .../X86/bitcast-int-to-vector-bool-zext.ll    |  28 +-
 11 files changed, 382 insertions(+), 238 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 7cdb1db6a77..76d585855b8 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -253,6 +253,11 @@ namespace {
       return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
     }
 
+    bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+                          SDValue &Base, SDValue &Scale,
+                          SDValue &Index, SDValue &Disp,
+                          SDValue &Segment);
+
     /// Implement addressing mode selection for inline asm expressions.
     bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                       unsigned ConstraintID,
@@ -2592,6 +2597,20 @@ bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
                     N.getOperand(1), Base, Scale, Index, Disp, Segment);
 }
 
+bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N,
+                                       SDValue &Base, SDValue &Scale,
+                                       SDValue &Index, SDValue &Disp,
+                                       SDValue &Segment) {
+  assert(Root && P && "Unknown root/parent nodes");
+  if (N->getOpcode() != X86ISD::VBROADCAST_LOAD ||
+      !IsProfitableToFold(N, P, Root) ||
+      !IsLegalToFold(N, P, Root, OptLevel))
+    return false;
+
+  return selectAddr(N.getNode(),
+                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
+}
+
 /// Return an SDNode that returns the value of the global base register.
 /// Output instructions required to initialize the global base register,
 /// if necessary.
@@ -4234,13 +4253,14 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
   auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
     // Look through single use bitcasts.
- if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) - Src = Src.getOperand(0); - - if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) { Parent = Src.getNode(); Src = Src.getOperand(0); - if (Src.getSimpleValueType() == CmpSVT) + } + + if (Src.getOpcode() == X86ISD::VBROADCAST_LOAD && Src.hasOneUse()) { + auto *MemIntr = cast(Src); + if (MemIntr->getMemoryVT().getSizeInBits() == CmpSVT.getSizeInBits()) return Src; } @@ -4252,17 +4272,18 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, bool FoldedBCast = false; if (!FoldedLoad && CanFoldLoads && (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { - SDNode *ParentNode = nullptr; + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); } // Try the other operand. if (!FoldedBCast) { + SDNode *ParentNode = N0.getNode(); if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { - FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, - Tmp1, Tmp2, Tmp3, Tmp4); + FoldedBCast = tryFoldBroadcast(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedBCast) std::swap(Src0, Src1); } @@ -4332,7 +4353,7 @@ bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, // Update the chain. ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); // Record the mem-refs - CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); + CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f79f7b70a9d..58398df2059 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -6130,6 +6130,37 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, } } + if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && + EltSizeInBits <= VT.getScalarSizeInBits()) { + auto *MemIntr = cast(Op); + if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) + return false; + + SDValue Ptr = MemIntr->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *CNode = dyn_cast(Ptr); + if (!CNode || CNode->isMachineConstantPoolEntry() || + CNode->getOffset() != 0) + return false; + + if (const Constant *C = CNode->getConstVal()) { + unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); + unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; + + APInt UndefSrcElts(NumSrcElts, 0); + SmallVector SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); + if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { + if (UndefSrcElts[0]) + UndefSrcElts.setBits(0, NumSrcElts); + SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); + return CastBitData(UndefSrcElts, SrcEltBits); + } + } + } + // Extract constant bits from a subvector broadcast. 
if (Op.getOpcode() == X86ISD::SUBV_BROADCAST) { SmallVector SubEltBits; @@ -28582,6 +28613,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -33347,6 +33379,19 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + // vbroadcast(scalarload X) -> vbroadcast_load X + if (!SrcVT.isVector() && Src.hasOneUse() && + ISD::isNormalLoad(Src.getNode())) { + LoadSDNode *LN = cast(Src); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, + LN->getMemoryVT(), LN->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + return BcastLd; + } + // Share broadcast with the longest vector and extract low subvector (free). for (SDNode *User : Src->uses()) if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && @@ -33512,17 +33557,23 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); } - // If we're inserting an element from a vbroadcast of a load, fold the + // If we're inserting an element from a vbroadcast load, fold the // load into the X86insertps instruction. We need to convert the scalar // load to a vector and clear the source lane of the INSERTPS control. - if (Op1.getOpcode() == X86ISD::VBROADCAST && Op1.hasOneUse() && - Op1.getOperand(0).hasOneUse() && - !Op1.getOperand(0).getValueType().isVector() && - ISD::isNormalLoad(Op1.getOperand(0).getNode())) - return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, - Op1.getOperand(0)), - DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { + auto *MemIntr = cast(Op1); + if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { + SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getMemOperand()); + SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, + Load), + DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Insert; + } + } return SDValue(); } @@ -35851,6 +35902,23 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, SrcOp); } + // If we're extracting a single element from a broadcast load and there are + // no other users, just create a single load. 
+ if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { + auto *MemIntr = cast(SrcBC); + unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); + if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && + VT.getSizeInBits() == SrcBCWidth) { + SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), + MemIntr->getBasePtr(), + MemIntr->getPointerInfo(), + MemIntr->getAlignment(), + MemIntr->getMemOperand()->getFlags()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); + return Load; + } + } + // Handle extract(truncate(x)) for 0'th index. // TODO: Treat this as a faux shuffle? // TODO: When can we use this for general indices? @@ -43893,6 +43961,21 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); + // If this is a broadcast load inserted into an upper undef, use a larger + // broadcast load. + if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && + SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { + auto *MemIntr = cast(SubVec); + SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + return SDValue(); } @@ -44065,6 +44148,20 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, InVec.getOperand(0).getValueSizeInBits() <= VT.getSizeInBits()) return DAG.getNode(X86ISD::VBROADCAST, SDLoc(N), VT, InVec.getOperand(0)); + if (InVec.getOpcode() == X86ISD::VBROADCAST_LOAD && InVec.hasOneUse()) { + auto *MemIntr = cast(InVec); + if (MemIntr->getMemoryVT().getSizeInBits() <= VT.getSizeInBits()) { + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; + SDValue BcastLd = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemIntr->getMemoryVT(), + MemIntr->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); + return BcastLd; + } + } + // If we're extracting the lowest subvector and we're the only user, // we may be able to perform this with a smaller vector width. if (IdxVal == 0 && InVec.hasOneUse()) { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 7aeb6669b29..5c967ca1eca 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -615,6 +615,9 @@ namespace llvm { // extract_vector_elt, store. VEXTRACT_STORE, + // scalar broadcast from memory + VBROADCAST_LOAD, + // Store FP control world into i16 memory. 
FNSTCW16m, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2cf5d46d095..4064d020cc4 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -74,6 +74,7 @@ class X86VectorVTInfo("alignedload" # VTName); PatFrag ScalarLdFrag = !cast("load" # EltVT); + PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), @@ -1124,7 +1125,8 @@ multiclass avx512_broadcast_rm_split opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, bit IsConvertibleToThreeAddress, - SDPatternOperator UnmaskedOp = X86VBroadcast> { + SDPatternOperator UnmaskedOp = X86VBroadcast, + SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> { let hasSideEffects = 0 in def r : AVX512PI opc, string OpcodeStr, (MaskInfo.VT (bitconvert (DestInfo.VT - (UnmaskedOp (SrcInfo.ScalarLdFrag addr:$src))))))], + (UnmaskedBcastOp addr:$src)))))], DestInfo.ExeDomain>, T8PD, EVEX, EVEX_CD8, Sched<[SchedRM]>; @@ -1182,7 +1184,7 @@ multiclass avx512_broadcast_rm_split opc, string OpcodeStr, (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))), + (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.ImmAllZerosV))], DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, EVEX_CD8, Sched<[SchedRM]>; @@ -1199,7 +1201,7 @@ multiclass avx512_broadcast_rm_split opc, string OpcodeStr, (MaskInfo.VT (bitconvert (DestInfo.VT - (X86VBroadcast (SrcInfo.ScalarLdFrag addr:$src))))), + (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.RC:$src0))], DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, EVEX_CD8, Sched<[SchedRM]>; @@ -1394,6 +1396,10 @@ let Predicates = [HasAVX512] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZm addr:$src)>; } let Predicates = [HasVLX] in { @@ -1402,6 +1408,12 @@ let Predicates = [HasVLX] in { (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ128m addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDZ256m addr:$src)>; } let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1422,6 +1434,12 @@ let Predicates = [HasVLX, HasBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZ256m addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZ256m addr:$src)>; } let Predicates = [HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -1434,6 +1452,10 @@ let Predicates = [HasBWI] in { def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWZm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+ def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWZm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -1669,12 +1691,12 @@ multiclass avx512_common_broadcast_32x2 opc, string OpcodeStr, let Predicates = [HasDQI] in defm Z : avx512_broadcast_rm_split, + _Src.info512, _Src.info128, 0, null_frag, null_frag>, EVEX_V512; let Predicates = [HasDQI, HasVLX] in defm Z256 : avx512_broadcast_rm_split, + _Src.info256, _Src.info128, 0, null_frag, null_frag>, EVEX_V256; } @@ -1685,7 +1707,7 @@ multiclass avx512_common_broadcast_i32x2 opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split, + _Src.info128, _Src.info128, 0, null_frag, null_frag>, EVEX_V128; } @@ -1753,7 +1775,7 @@ multiclass avx512_perm_i_mb opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, - IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1830,7 +1852,7 @@ multiclass avx512_perm_i_lowering(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; @@ -1869,7 +1891,7 @@ multiclass avx512_perm_t_mb opc, string OpcodeStr, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, - IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, AVX5128IBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2166,7 +2188,7 @@ multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + (_.BroadcastLdFrag addr:$src2)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI opc, string OpcodeStr, PatFrag OpNode, "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode_su (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))]>, + (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -2345,8 +2366,7 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag, "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond)))]>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8 opc, string Suffix, PatFrag Frag, [(set _.KRC:$dst, (and _.KRCWM:$mask, (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), cond))))]>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; - def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(_.KVT (CommFrag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), (!cast(Name#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; def : Pat<(and _.KRCWM:$mask, - (_.KVT (CommFrag_su:$cc (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + (_.KVT (CommFrag_su:$cc (_.BroadcastLdFrag 
addr:$src2), (_.VT _.RC:$src1), cond))), (!cast(Name#_.ZSuffix#"rmibk") _.KRCWM:$mask, _.RC:$src1, addr:$src2, @@ -2551,10 +2569,10 @@ multiclass avx512_vcmp_common, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -2571,13 +2589,12 @@ multiclass avx512_vcmp_common; - def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), timm:$cc), (!cast(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, (X86cmpm_imm_commute timm:$cc))>; - def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), + def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), timm:$cc)), (!cast(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, @@ -2721,8 +2738,7 @@ multiclass avx512_vector_fpclass opc, string OpcodeStr, _.BroadcastStr##", $dst|$dst, ${src1}" ##_.BroadcastStr##", $src2}", [(set _.KRC:$dst,(X86Vfpclass - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), + (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2)))]>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512 opc, string OpcodeStr, _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## _.BroadcastStr##", $src2}", [(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su - (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1))), + (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2))))]>, EVEX_B, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4589,8 +4604,7 @@ multiclass avx512_binop_rmb opc, string OpcodeStr, SDNode OpNode, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (_.BroadcastLdFrag addr:$src2)))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4716,8 +4730,7 @@ multiclass avx512_binop_rm2 opc, string OpcodeStr, "${src2}"##_Brdct.BroadcastStr##", $src1", "$src1, ${src2}"##_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Brdct.VT (X86VBroadcast - (_Brdct.ScalarLdFrag addr:$src2))))))>, + (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, AVX512BIBase, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4789,8 +4802,7 @@ multiclass avx512_packs_rmb opc, string OpcodeStr, SDNode OpNode, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, + (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5149,15 +5161,13 @@ multiclass avx512_logical_lowering_bcast { // Register-broadcast logical operations. 
def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), + (bitconvert (_.VT (_.BroadcastLdFrag addr:$src2))))), (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, (bitconvert (_.VT - (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))))), + (_.BroadcastLdFrag addr:$src2)))))), _.RC:$src0)), (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5165,8 +5175,7 @@ multiclass avx512_logical_lowering_bcast(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -5447,8 +5456,7 @@ multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpN (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5578,8 +5586,7 @@ multiclass avx512_fp_scalef_p opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, + (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5752,7 +5759,7 @@ multiclass avx512_shift_rmbi opc, Format ImmFormM, defm mbi : AVX512_maskable, + (_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>, EVEX_B, Sched<[sched.Folded]>; } @@ -5936,8 +5943,7 @@ multiclass avx512_var_shift_mb opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))>, + (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6211,8 +6217,7 @@ multiclass avx512_permil_vec OpcVar, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, + (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6402,7 +6407,7 @@ multiclass avx512_fma3p_213_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6476,7 +6481,7 @@ multiclass avx512_fma3p_231_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6554,7 +6559,7 @@ multiclass avx512_fma3p_132_rm opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode 
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6947,7 +6952,7 @@ multiclass avx512_pmadd52_rm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -7487,14 +7492,13 @@ multiclass avx512_vcvt_fp opc, string OpcodeStr, X86VectorVTInfo _, OpcodeStr, "${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode (_Src.VT - (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + (_Src.BroadcastLdFrag addr:$src)) )), (vselect MaskRC:$mask, (_.VT (OpNode (_Src.VT - (X86VBroadcast - (_Src.ScalarLdFrag addr:$src))))), + (_Src.BroadcastLdFrag addr:$src)))), _.RC:$src0), vselect, "$src0 = $dst">, EVEX, EVEX_B, Sched<[sched.Folded]>; @@ -7629,14 +7633,14 @@ let Predicates = [HasAVX512] in { v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZrmb addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), (v8f32 VR256X:$src0)), (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>; def : Pat<(vselect VK8WM:$mask, - (fpround (v8f64 (X86VBroadcast (loadf64 addr:$src)))), + (fpround (v8f64 (X86VBroadcastld64 addr:$src))), v8f32x_info.ImmAllZerosV), (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>; } @@ -7660,14 +7664,14 @@ let Predicates = [HasVLX] in { v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2PSZ256rmb addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), VR128X:$src0), (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(vselect VK4WM:$mask, - (v4f32 (fpround (v4f64 (X86VBroadcast (loadf64 addr:$src))))), + (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))), v4f32x_info.ImmAllZerosV), (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>; @@ -7691,12 +7695,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(X86vfpround (v2f64 (X86VBroadcast (loadf64 addr:$src)))), + def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))), (VCVTPD2PSZ128rmb addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTPD2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86vmfpround (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8177,12 +8181,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2Int (v2f64 (X86VBroadcastld64 
addr:$src)))), (VCVTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2Int (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8206,12 +8210,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2DQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2DQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2DQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8235,12 +8239,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvtp2UInt (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8264,12 +8268,12 @@ let Predicates = [HasVLX] in { VK2WM:$mask), (VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))))), + def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))), (VCVTTPD2UDQZ128rmb addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), (v4i32 VR128X:$src0), VK2WM:$mask), (VCVTTPD2UDQZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcast (loadf64 addr:$src))), + def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)), v4i32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTTPD2UDQZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8402,12 +8406,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; @@ -8431,12 
+8435,12 @@ let Predicates = [HasDQI, HasVLX] in { VK2WM:$mask), (VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>; - def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))), (VCVTUQQ2PSZ128rmb addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), (v4f32 VR128X:$src0), VK2WM:$mask), (VCVTUQQ2PSZ128rmbk VR128X:$src0, VK2WM:$mask, addr:$src)>; - def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcast (loadi64 addr:$src))), + def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)), v4f32x_info.ImmAllZerosV, VK2WM:$mask), (VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>; } @@ -8748,7 +8752,7 @@ multiclass avx512_fp14_p opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8842,7 +8846,7 @@ multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -8923,7 +8927,7 @@ multiclass avx512_sqrt_packed opc, string OpcodeStr, (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (fsqrt (_.VT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + (_.BroadcastLdFrag addr:$src)))>, EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -10103,7 +10107,7 @@ multiclass avx512_unary_fp_packed_imm opc, string OpcodeStr, SDNode OpNo (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", - (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (OpNode (_.VT (_.BroadcastLdFrag addr:$src1)), (i32 timm:$src2))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10166,7 +10170,7 @@ multiclass avx512_fp_packed_imm opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i32 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10209,7 +10213,7 @@ multiclass avx512_3Op_imm8 opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10403,7 +10407,7 @@ multiclass avx512_shuff_packed_128_common opc, string OpcodeStr, (bitconvert (CastInfo.VT (X86Shuf128 _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.BroadcastLdFrag addr:$src2), (i8 timm:$src3)))))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10489,7 +10493,7 @@ multiclass avx512_valign opc, string OpcodeStr, OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (X86VAlign _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (_.VT (_.BroadcastLdFrag addr:$src2)), (i8 timm:$src3))>, 
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -10581,8 +10585,7 @@ multiclass avx512_vpalign_mask_lowering_mb : avx512_vpalign_mask_lowering { def : Pat<(From.VT (OpNode From.RC:$src1, - (bitconvert (To.VT (X86VBroadcast - (To.ScalarLdFrag addr:$src2)))), + (bitconvert (To.VT (To.BroadcastLdFrag addr:$src2))), timm:$src3)), (!cast(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, (ImmXForm timm:$src3))>; @@ -10591,8 +10594,7 @@ multiclass avx512_vpalign_mask_lowering_mb(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, @@ -10603,8 +10605,7 @@ multiclass avx512_vpalign_mask_lowering_mb(OpcodeStr#"rmbikz") To.KRCWM:$mask, @@ -10667,8 +10668,7 @@ multiclass avx512_unary_rmb opc, string OpcodeStr, SDNode OpNode, (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, - (_.VT (OpNode (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src1)))))>, + (_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded]>; } @@ -10811,16 +10811,16 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// -multiclass avx512_movddup_128 opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_movddup_128 opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable, EVEX, + (_.VT (X86VBroadcast (_.VT _.RC:$src)))>, EVEX, Sched<[sched]>; defm rm : AVX512_maskable, + (_.VT (_.BroadcastLdFrag addr:$src))>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, Sched<[sched.Folded]>; } @@ -10834,7 +10834,7 @@ multiclass avx512_movddup_common opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX512, HasVLX] in { defm Z256 : avx512_unary_rm, EVEX_V256; - defm Z128 : avx512_movddup_128, EVEX_V128; } } @@ -10863,10 +10863,10 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), immAllZerosV), (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), (v2f64 VR128X:$src0)), (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)), immAllZerosV), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; @@ -11207,7 +11207,7 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr##", $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (_.VT (_.BroadcastLdFrag addr:$src3)), (i8 timm:$src4)), 1, 0>, EVEX_B, AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; @@ -11285,12 +11285,12 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 timm:$src4))>; // Additional patterns for matching broadcasts in other positions. 
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + def : Pat<(_.VT (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4))), (!cast(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11298,7 +11298,7 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching zero masking with broadcasts in other // positions. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, @@ -11306,7 +11306,7 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), _.ImmAllZerosV)), (!cast(Name#_.ZSuffix#rmbikz) _.RC:$src1, @@ -11316,33 +11316,32 @@ multiclass avx512_ternlog opc, string OpcodeStr, SDNode OpNode, // Additional patterns for matching masked broadcasts with different // operand orders. def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3), _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src2, _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (_.BroadcastLdFrag addr:$src3), _.RC:$src1, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (OpNode (_.BroadcastLdFrag addr:$src3), _.RC:$src1, _.RC:$src2, (i8 timm:$src4)), _.RC:$src1)), (!cast(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, @@ -11371,61 +11370,61 @@ defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, // FIXME: Need better DAG canonicalization. 
let Predicates = [HasVLX] in { def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), VR128X:$src2, (i8 timm:$src4)), (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), VR128X:$src2, (i8 timm:$src4)), (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), VR256X:$src2, (i8 timm:$src4)), (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), VR256X:$src2, (i8 timm:$src4)), (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11433,31 +11432,31 @@ let Predicates = [HasVLX] in { let Predicates = [HasAVX512] in { def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 
addr:$src3)))), + (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), VR512:$src2, VR512:$src1, (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcast (loadf32 addr:$src3)))), + (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), VR512:$src2, (i8 timm:$src4)), (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + def : Pat<(X86vpternlog (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), VR512:$src2, VR512:$src1, (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>; def : Pat<(X86vpternlog VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcast (loadf64 addr:$src3)))), + (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), VR512:$src2, (i8 timm:$src4)), (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>; @@ -11696,7 +11695,7 @@ multiclass avx512_fixupimm_packed opc, string OpcodeStr, "$src2, ${src3}"##_.BroadcastStr##", $src4", (X86VFixupimm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), + (TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)), (i32 timm:$src4))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } // Constraints = "$src1 = $dst" @@ -11987,7 +11986,7 @@ multiclass VBMI2_shift_var_rmb Op, string OpStr, SDNode OpNode, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12084,8 +12083,7 @@ multiclass VNNI_rmb Op, string OpStr, SDNode OpNode, OpStr, "${src3}"##VTI.BroadcastStr##", $src2", "$src2, ${src3}"##VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, - (VTI.VT (X86VBroadcast - (VTI.ScalarLdFrag addr:$src3))))>, + (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12221,7 +12219,7 @@ multiclass GF2P8AFFINE_avx512_rmb_imm Op, string OpStr, SDNode OpNode, OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", (OpNode (VTI.VT VTI.RC:$src1), - (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), + (bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))), (i8 timm:$src3))>, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12333,7 +12331,7 @@ multiclass avx512_vp2intersect_modes { !strconcat("vp2intersect", _.Suffix, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect - _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))]>, + _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; } @@ 
-12434,12 +12432,12 @@ let Predicates = [HasBF16, HasVLX] in { (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 - (X86VBroadcast (loadf32 addr:$src))))), + (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), (v8i16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcast (loadf32 addr:$src))), + def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), v8i16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } @@ -12466,7 +12464,7 @@ multiclass avx512_dpbf16ps_rm opc, string OpcodeStr, SDNode OpNode, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (src_v.VT (X86VBroadcast(src_v.ScalarLdFrag addr:$src3)))))>, + (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, EVEX_B, EVEX_4V; } diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index d75b492594b..de6f8a81dff 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -103,6 +103,8 @@ def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86VBroadcastld : SDNode<"X86ISD::VBROADCAST_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisInt<1>, @@ -954,6 +956,26 @@ def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr), return cast(N)->getMemoryVT().getStoreSize() == 8; }]>; +def X86VBroadcastld8 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 1; +}]>; + +def X86VBroadcastld16 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + +def X86VBroadcastld32 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 4; +}]>; + +def X86VBroadcastld64 : PatFrag<(ops node:$src), + (X86VBroadcastld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 8; +}]>; + def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index eceace87c00..09a04c0338b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -6911,10 +6911,10 @@ def : Pat<(nontemporalstore FR64:$src, addr:$dst), // class avx_broadcast_rm opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, - PatFrag ld_frag, SchedWrite Sched> : + PatFrag bcast_frag, SchedWrite Sched> : AVX8I, + [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, Sched<[Sched]>, VEX; // AVX2 adds register forms @@ -6927,15 +6927,15 @@ class avx2_broadcast_rr opc, string OpcodeStr, RegisterClass RC, let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, - f32mem, v4f32, loadf32, + f32mem, v4f32, X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>; def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, - f32mem, v8f32, loadf32, + f32mem, v8f32, X86VBroadcastld32, 
SchedWriteFShuffle.XMM.Folded>, VEX_L; } let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, - v4f64, loadf64, + v4f64, X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>, VEX_L; let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { @@ -7406,7 +7406,7 @@ def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0 // destination operand // multiclass avx2_broadcast opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, + X86MemOperand x86memop, PatFrag bcast_frag, ValueType OpVT128, ValueType OpVT256, Predicate prd> { let Predicates = [HasAVX2, prd] in { def rr : AVX28I opc, string OpcodeStr, def rm : AVX28I, + (OpVT128 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; def Yrr : AVX28I opc, string OpcodeStr, def Yrm : AVX28I, + (OpVT256 (bcast_frag addr:$src)))]>, Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; // Provide aliases for broadcast from the same register class that @@ -7438,13 +7438,13 @@ multiclass avx2_broadcast opc, string OpcodeStr, } } -defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, v16i8, v32i8, NoVLX_Or_NoBWI>; -defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, v8i16, v16i16, NoVLX_Or_NoBWI>; -defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, v4i32, v8i32, NoVLX>; -defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2, NoVLX] in { @@ -7453,6 +7453,12 @@ let Predicates = [HasAVX2, NoVLX] in { (VPBROADCASTQrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), (VPBROADCASTQYrm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8/i16. + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. @@ -7473,6 +7479,12 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (zextloadi16 addr:$src)))))), (VPBROADCASTWYrm addr:$src)>; + + // FIXME this is to handle aligned extloads from i8. 
+  def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
+            (VPBROADCASTWYrm addr:$src)>;
 }
 let Predicates = [HasAVX2, NoVLX] in {
@@ -7518,11 +7530,11 @@ let Predicates = [HasAVX2, NoVLX] in {
 // AVX1 broadcast patterns
 let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
          (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSrm addr:$src)>;
 }
@@ -7532,7 +7544,7 @@ let Predicates = [HasAVX, NoVLX] in {
   // 128bit broadcasts:
   def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
-  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
   def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
@@ -7568,7 +7580,7 @@ let Predicates = [HasAVX1Only] in {
   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
-  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
 }
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 20327658a7b..a526518c3fe 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -29,11 +29,10 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) {
 define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
 ; X86: ## %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd %eax, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xc8]
+; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x4c,0x24,0x01]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastd %eax, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc0]
-; X86-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd0]
+; X86-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc1]
+; X86-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd1]
 ; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
 ; X86-NEXT: retl ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index beea392fc17..0ca1f7be885 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -49,11 +49,10 @@ declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
 define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
 ; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastb %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc8]
+; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x4c,0x24,0x04]
 ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastb %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc0]
-; X86-NEXT: vpbroadcastb %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd0]
+; X86-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0xc1]
+; X86-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd1]
 ; X86-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2]
 ; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -79,11 +78,10 @@ declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i
 define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
 ; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastw %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc8]
+; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x4c,0x24,0x02]
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastw %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc0]
-; X86-NEXT: vpbroadcastw %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd0]
+; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1]
+; X86-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd1]
 ; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
 ; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index bd7969e9969..8907578cd33 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -7,11 +7,10 @@ declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
 define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
 ; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastb %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc8]
+; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x4c,0x24,0x04]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastb %eax, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc0]
-; X86-NEXT: vpbroadcastb %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd0]
+; X86-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0xc1]
+; X86-NEXT: vmovdqu8 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd1]
 ; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
 ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -39,12 +38,11 @@ declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
 define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastw %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc8]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
-; X86-NEXT: vpbroadcastw %eax, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc0]
-; X86-NEXT: vpbroadcastw %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd0]
+; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x4c,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1]
+; X86-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd1]
 ; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
 ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -72,11 +70,10 @@ define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16
 define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
 ; X86: # %bb.0:
-; X86-NEXT: movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastb %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xc8]
+; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x4c,0x24,0x04]
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastb %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc0]
-; X86-NEXT: vpbroadcastb %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd0]
+; X86-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0xc1]
+; X86-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd1]
 ; X86-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
 ; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -105,11 +102,10 @@ declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i
 define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
 ; X86: # %bb.0:
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastw %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xc8]
+; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x4c,0x24,0x04]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
-; X86-NEXT: vpbroadcastw %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc0]
-; X86-NEXT: vpbroadcastw %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd0]
+; X86-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1]
+; X86-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd1]
 ; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
 ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 022ec6c6855..8684d1f568f 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -7,12 +7,11 @@ declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
 define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd %eax, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc8]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT: vpbroadcastd %eax, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc0]
-; X86-NEXT: vpbroadcastd %eax, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xd0]
+; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x4c,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc1]
+; X86-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd1]
 ; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
 ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
@@ -72,12 +71,11 @@ define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64
 define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
 ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT: vpbroadcastd %eax, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xc8]
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
-; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9]
-; X86-NEXT: vpbroadcastd %eax, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc0]
-; X86-NEXT: vpbroadcastd %eax, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xd0]
+; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x4c,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc1]
+; X86-NEXT: vmovdqa32 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0xd1]
 ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 07e5f7a029c..af1abe71e2f 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -496,12 +496,12 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 ; AVX512F-NEXT: kmovw %edi, %k1
 ; AVX512F-NEXT: shrl $16, %edi
 ; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z}
 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: ext_i32_32i8:
@@ -898,18 +898,18 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: kmovw %eax, %k3
 ; AVX512F-NEXT: kmovw %edi, %k4
-; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k4} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k4} {z}
 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k3} {z}
 ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: ext_i64_64i8:
-- 
2.40.0