From fbcef4fae71e52778d4e81c09ecd9572af8bae48 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 14 Apr 2019 18:26:11 +0000
Subject: [PATCH] [X86] Move VPTESTM matching from the isel table to custom code in X86ISelDAGToDAG.

We had many tablegen patterns for these instructions, and due to the
commutability of the patterns, tablegen expands them to even more patterns.
Altogether, the VPTESTMD patterns accounted for more than 50K of the 610K
isel table. This had gotten bad when we stopped canonicalizing AND to vXi64,
which required a pattern for every combination of bitcast input type.

This change moves the matching to custom code where it is easier to look
through the bitcasts without being concerned with the specific types.

The test changes are because we are now stricter with one-use checks, as
that is required to make load folding legal. We now require the AND and any
BITCAST to have only a single use. This prevents forming a VPTESTM and a
VPAND with the same inputs.

We now support broadcast loads for 128/256-bit patterns without VLX. We'll
widen to 512 bits and still fold the broadcast since the amount of memory
read doesn't change.

There are a few tests that got slightly longer because we now prefer
load + VPTESTM over XOR+VPCMPEQ for (seteq (load), allzeros). Previously we
were able to share the XOR with multiple VPTESTM instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358359 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelDAGToDAG.cpp            | 361 ++++++++++
 lib/Target/X86/X86InstrAVX512.td              | 286 +-------
 test/CodeGen/X86/avx512-vec-cmp.ll            |   4 +-
 test/CodeGen/X86/kshift.ll                    | 116 ++--
 test/CodeGen/X86/movmsk-cmp.ll                |  36 +-
 test/CodeGen/X86/prefer-avx256-mask-extend.ll |  72 +-
 .../CodeGen/X86/prefer-avx256-mask-shuffle.ll |  28 +-
 test/CodeGen/X86/setcc-lowering.ll            |   3 +-
 test/CodeGen/X86/vector-fshl-128.ll           | 494 +++++++-------
 test/CodeGen/X86/vector-fshl-256.ll           | 488 +++++++-------
 test/CodeGen/X86/vector-fshl-512.ll           | 632 +++++++++---------
 test/CodeGen/X86/vector-fshr-128.ll           | 494 +++++++-------
 test/CodeGen/X86/vector-fshr-256.ll           | 488 +++++++-------
 test/CodeGen/X86/vector-fshr-512.ll           | 560 ++++++++--------
 test/CodeGen/X86/vector-lzcnt-512.ll          |  76 +--
 15 files changed, 2041 insertions(+), 2097 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index cce4d9bb382..82669300309 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -483,6 +483,7 @@ namespace {
     bool shrinkAndImmediate(SDNode *N);
     bool isMaskZeroExtended(SDNode *N) const;
     bool tryShiftAmountMod(SDNode *N);
+    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
 
     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
                                 const SDLoc &dl, MVT VT, SDNode *Node);
@@ -3441,6 +3442,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
   return true;
 }
 
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+                              bool FoldedBCast, bool Masked) {
+  if (Masked) {
+    if (FoldedLoad) {
+      switch (TestVT.SimpleTy) {
+      default: llvm_unreachable("Unexpected VT!");
+      case MVT::v16i8:
+        return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
+      case MVT::v8i16:
+        return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
+      case MVT::v4i32:
+        return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
+      case MVT::v2i64:
+        return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
+      case MVT::v32i8:
+        return IsTestN ?
X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk; + } + } + + if (FoldedLoad) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm; + } + } + + if (FoldedBCast) { + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v4i32: + return IsTestN ? 
X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb; + } + } + + switch (TestVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v16i8: + return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr; + case MVT::v8i16: + return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr; + case MVT::v4i32: + return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr; + case MVT::v2i64: + return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr; + case MVT::v32i8: + return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr; + case MVT::v16i16: + return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr; + case MVT::v8i32: + return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr; + case MVT::v4i64: + return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr; + case MVT::v64i8: + return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr; + case MVT::v32i16: + return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr; + case MVT::v16i32: + return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr; + case MVT::v8i64: + return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr; + } +} + +// Try to create VPTESTM instruction. If InMask is not null, it will be used +// to form a masked operation. +bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, + SDValue InMask) { + assert(Subtarget->hasAVX512() && "Expected AVX512!"); + assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && + "Unexpected VT!"); + + // Look for equal and not equal compares. + ISD::CondCode CC = cast(Setcc.getOperand(2))->get(); + if (CC != ISD::SETEQ && CC != ISD::SETNE) + return false; + + // See if we're comparing against zero. This should have been canonicalized + // to RHS during lowering. + if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode())) + return false; + + SDValue N0 = Setcc.getOperand(0); + + MVT CmpVT = N0.getSimpleValueType(); + MVT CmpSVT = CmpVT.getVectorElementType(); + + // Start with both operands the same. We'll try to refine this. + SDValue Src0 = N0; + SDValue Src1 = N0; + + { + // Look through single use bitcasts. + SDValue N0Temp = N0; + if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) + N0Temp = N0.getOperand(0); + + // Look for single use AND. + if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { + Src0 = N0Temp.getOperand(0); + Src1 = N0Temp.getOperand(1); + } + } + + // Without VLX we need to widen the load. + bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); + + // We can only fold loads if the sources are unique. + bool CanFoldLoads = Src0 != Src1; + + // Try to fold loads unless we need to widen. + bool FoldedLoad = false; + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load; + if (!Widen && CanFoldLoads) { + Load = Src1; + FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3, + Tmp4); + if (!FoldedLoad) { + // And is computative. + Load = Src0; + FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, + Tmp3, Tmp4); + if (FoldedLoad) + std::swap(Src0, Src1); + } + } + + auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) { + // Look through single use bitcasts. 
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse()) + Src = Src.getOperand(0); + + if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) { + Parent = Src.getNode(); + Src = Src.getOperand(0); + if (Src.getSimpleValueType() == CmpSVT) + return Src; + } + + return SDValue(); + }; + + // If we didn't fold a load, try to match broadcast. No widening limitation + // for this. But only 32 and 64 bit types are supported. + bool FoldedBCast = false; + if (!FoldedLoad && CanFoldLoads && + (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) { + SDNode *ParentNode; + if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) { + FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); + } + + // Try the other operand. + if (!FoldedBCast) { + if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) { + FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0, + Tmp1, Tmp2, Tmp3, Tmp4); + if (FoldedBCast) + std::swap(Src0, Src1); + } + } + } + + auto getMaskRC = [](MVT MaskVT) { + switch (MaskVT.SimpleTy) { + default: llvm_unreachable("Unexpected VT!"); + case MVT::v2i1: return X86::VK2RegClassID; + case MVT::v4i1: return X86::VK4RegClassID; + case MVT::v8i1: return X86::VK8RegClassID; + case MVT::v16i1: return X86::VK16RegClassID; + case MVT::v32i1: return X86::VK32RegClassID; + case MVT::v64i1: return X86::VK64RegClassID; + } + }; + + bool IsMasked = InMask.getNode() != nullptr; + + SDLoc dl(Root); + + MVT ResVT = Setcc.getSimpleValueType(); + MVT MaskVT = ResVT; + if (Widen) { + // Widen the inputs using insert_subreg or copy_to_regclass. + unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; + unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; + unsigned NumElts = CmpVT.getVectorNumElements() * Scale; + CmpVT = MVT::getVectorVT(CmpSVT, NumElts); + MaskVT = MVT::getVectorVT(MVT::i1, NumElts); + SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, + CmpVT), 0); + Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); + + assert(!FoldedLoad && "Shouldn't have folded the load"); + if (!FoldedBCast) + Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); + + if (IsMasked) { + // Widen the mask. + unsigned RegClass = getMaskRC(MaskVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, MaskVT, InMask, RC), 0); + } + } + + bool IsTestN = CC == ISD::SETEQ; + unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, + IsMasked); + + MachineSDNode *CNode; + if (FoldedLoad || FoldedBCast) { + SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); + + if (IsMasked) { + SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } else { + SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, + Load.getOperand(0) }; + CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); + } + + // Update the chain. + ReplaceUses(Load.getValue(1), SDValue(CNode, 1)); + // Record the mem-refs + CurDAG->setNodeMemRefs(CNode, {cast(Load)->getMemOperand()}); + } else { + if (IsMasked) + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); + else + CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); + } + + // If we widened, we need to shrink the mask VT. 
+ if (Widen) { + unsigned RegClass = getMaskRC(ResVT); + SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); + CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, ResVT, SDValue(CNode, 0), RC); + } + + ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); + CurDAG->RemoveDeadNode(Root); + return true; +} + void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); @@ -3570,6 +3912,18 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case ISD::AND: + if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) { + // Try to form a masked VPTESTM. Operands can be in either order. + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() && + tryVPTESTM(Node, N0, N1)) + return; + if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() && + tryVPTESTM(Node, N1, N0)) + return; + } + if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) { ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); @@ -4207,6 +4561,13 @@ void X86DAGToDAGISel::Select(SDNode *Node) { return; } + case ISD::SETCC: { + if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) + return; + + break; + } + case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 4403f986b23..f86d9894df9 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5813,309 +5813,93 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// -multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_vptest opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Name> { - let ExeDomain = _.ExeDomain in { + // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG. + // There are just too many permuations due to commutability and bitcasts. + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { let isCommutable = 1 in defm rr : AVX512_maskable_cmp, + (null_frag), (null_frag)>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in defm rm : AVX512_maskable_cmp, + (null_frag), (null_frag)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } - - // Patterns for compare with 0 that just use the same source twice. - def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), - (_.KVT (!cast(Name # _.ZSuffix # "rr") - _.RC:$src, _.RC:$src))>; - - def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))), - (_.KVT (!cast(Name # _.ZSuffix # "rrk") - _.KRC:$mask, _.RC:$src, _.RC:$src))>; } -multiclass avx512_vptest_mb opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, - X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in +multiclass avx512_vptest_mb opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in defm rmb : AVX512_maskable_cmp, + (null_frag), (null_frag)>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } -// Use 512bit version to implement 128/256 bit in case NoVLX. 
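The decision that tryVPTESTM makes above can be summarized with a minimal standalone sketch. This is plain C++ with simplified, made-up node descriptions (not the real SelectionDAG API); it only models the selection logic: an EQ/NE compare against all-zeros, looking through a single-use bitcast to a single-use AND, choosing the NM or M flavor, and preferring a foldable load or broadcast when the two sources differ.

// Stand-alone model of the tryVPTESTM decision. The Node struct and string
// opcodes are assumed stand-ins, not LLVM types.
#include <iostream>
#include <string>

struct Node {
  std::string Opcode;          // "setcc", "and", "bitcast", "load", "broadcast", "reg"
  std::string CondCode;        // for "setcc": "eq" or "ne"
  bool RHSIsAllZeros = false;  // zeros were canonicalized to the RHS earlier
  bool OneUse = true;
  const Node *Op0 = nullptr;
  const Node *Op1 = nullptr;
};

// Returns "" when the node is not a VPTESTM/VPTESTNM candidate, otherwise a
// description of the form that would be picked.
std::string pickVPTESTM(const Node &Setcc) {
  if (Setcc.Opcode != "setcc" ||
      (Setcc.CondCode != "eq" && Setcc.CondCode != "ne") ||
      !Setcc.RHSIsAllZeros)
    return "";

  const Node *Src = Setcc.Op0;
  // Look through a single-use bitcast. The one-use checks keep load folding
  // legal and avoid keeping a VPAND alive next to the new VPTESTM.
  if (Src->Opcode == "bitcast" && Src->OneUse)
    Src = Src->Op0;

  // Default: compare one value against itself (seteq X, 0 -> vptestnm X, X).
  const Node *Src0 = Src, *Src1 = Src;
  if (Src->Opcode == "and" && Src->OneUse) {
    Src0 = Src->Op0;
    Src1 = Src->Op1;
  }

  // SETEQ against zero means "no bits in common", i.e. the NM flavor.
  std::string Inst = (Setcc.CondCode == "eq") ? "vptestnm" : "vptestm";

  // A memory operand can only be folded when the two sources are distinct.
  if (Src0 != Src1 && Src1->Opcode == "load")
    return Inst + " with a folded load";
  if (Src0 != Src1 && Src1->Opcode == "broadcast")
    return Inst + " with a folded broadcast";
  return Inst + " register-register";
}

int main() {
  Node X{"reg"}, Y{"load"};
  Node And{"and", "", false, true, &X, &Y};
  Node Cmp{"setcc", "eq", true, true, &And, nullptr};
  std::cout << pickVPTESTM(Cmp) << "\n";  // vptestnm with a folded load
}

The real code also tries the commuted operand order before giving up on folding, and widens to the 512-bit form when VLX is not available.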
-multiclass avx512_vptest_lowering { - def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2), - _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast(Name # "Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode_su (and _.RC:$src1, _.RC:$src2), - _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast(Name # "Zrrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC)>; - - def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast(Name # "Zrr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast(Name # "Zrrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src, _.SubRegIdx)), - _.KRC)>; -} - -multiclass avx512_vptest_dq_sizes opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_vptest_dq_sizes opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_vptest, - avx512_vptest_mb, EVEX_V512; + defm Z : avx512_vptest, + avx512_vptest_mb, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_vptest, - avx512_vptest_mb, EVEX_V256; - defm Z128 : avx512_vptest, - avx512_vptest_mb, EVEX_V128; - } - let Predicates = [HasAVX512, NoVLX] in { - defm Z256_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info256, NAME>; - defm Z128_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info128, NAME>; + defm Z256 : avx512_vptest, + avx512_vptest_mb, EVEX_V256; + defm Z128 : avx512_vptest, + avx512_vptest_mb, EVEX_V128; } } -multiclass avx512_vptest_dq opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched> { - defm D : avx512_vptest_dq_sizes opc, string OpcodeStr, + X86SchedWriteWidths sched> { + defm D : avx512_vptest_dq_sizes; - defm Q : avx512_vptest_dq_sizes, VEX_W; } multiclass avx512_vptest_wb opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched> { let Predicates = [HasBWI] in { - defm WZ: avx512_vptest, EVEX_V512, VEX_W; - defm BZ: avx512_vptest, EVEX_V512; } let Predicates = [HasVLX, HasBWI] in { - defm WZ256: avx512_vptest, EVEX_V256, VEX_W; - defm WZ128: avx512_vptest, EVEX_V128, VEX_W; - defm BZ256: avx512_vptest, EVEX_V256; - defm BZ128: avx512_vptest, EVEX_V128; } - - let Predicates = [HasBWI, NoVLX] in { - defm BZ256_Alt : avx512_vptest_lowering; - defm BZ128_Alt : avx512_vptest_lowering; - defm WZ256_Alt : avx512_vptest_lowering; - defm WZ128_Alt : avx512_vptest_lowering; - } } -// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm -// as commutable here because we already canonicalized all zeros vectors to the -// RHS during lowering. 
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2), - (setcc node:$src1, node:$src2, SETEQ)>; -def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2), - (setcc node:$src1, node:$src2, SETNE)>; - -def X86pcmpeqm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpnem_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpnem node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - multiclass avx512_vptest_all_forms opc_wb, bits<8> opc_dq, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched> : - avx512_vptest_wb, - avx512_vptest_dq; + avx512_vptest_wb, + avx512_vptest_dq; -defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem, - X86pcmpnem_su, SchedWriteVecLogic>, T8PD; -defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm, - X86pcmpeqm_su, SchedWriteVecLogic>, T8XS; - - -multiclass avx512_vptest_lowering_pats { - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (!cast(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode_su (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV))), - (!cast(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1, - _.RC:$src2)>; - - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, - (AndInfo.LdFrag addr:$src2)))), - _.ImmAllZerosV)), - (!cast(InstrStr # "rm") _.RC:$src1, addr:$src2)>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode_su (bitconvert - (AndInfo.VT (and _.RC:$src1, - (AndInfo.LdFrag addr:$src2)))), - _ .ImmAllZerosV))), - (!cast(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1, - addr:$src2)>; -} - -// Patterns to use 512-bit instructions when 128/256 are not available. 
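The wide patterns below handled the NoVLX case by running the 512-bit instruction on widened operands; the equivalent now lives in tryVPTESTM, which inserts the narrow sources into an IMPLICIT_DEF 512-bit register via INSERT_SUBREG and then uses COPY_TO_REGCLASS to narrow the result mask. A standalone sketch of why that is sound for, e.g., a v4i32 test widened to v16i32 — the extra lanes hold arbitrary values, but their mask bits are never consumed (plain C++, illustrative only):

#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <random>

using std::uint32_t;

// Model of VPTESTNMD at an arbitrary lane count: mask bit i is set when
// (a[i] & b[i]) == 0.
template <std::size_t N>
uint32_t vptestnmd(const std::array<uint32_t, N> &a,
                   const std::array<uint32_t, N> &b) {
  uint32_t Mask = 0;
  for (std::size_t i = 0; i < N; ++i)
    Mask |= uint32_t((a[i] & b[i]) == 0) << i;
  return Mask;
}

int main() {
  std::mt19937 Rng(0);
  std::array<uint32_t, 4> X{}, Y{};  // the real v4i32 sources
  for (auto &V : X) V = Rng();
  for (auto &V : Y) V = Rng();

  // IMPLICIT_DEF + INSERT_SUBREG: the low 4 lanes are the real values, the
  // upper 12 lanes are whatever happened to be there (modeled as random).
  std::array<uint32_t, 16> WideX{}, WideY{};
  for (std::size_t i = 0; i < 16; ++i) {
    WideX[i] = i < 4 ? X[i] : uint32_t(Rng());
    WideY[i] = i < 4 ? Y[i] : uint32_t(Rng());
  }

  // COPY_TO_REGCLASS to the narrow mask class keeps only the low 4 mask bits.
  uint32_t Narrow = vptestnmd(X, Y);
  uint32_t WidenedThenNarrowed = vptestnmd(WideX, WideY) & 0xF;
  assert(Narrow == WidenedThenNarrowed);
  return 0;
}

A regular load cannot be folded on this path because widening it would read more memory (hence the assert(!FoldedLoad) when widening), while a folded broadcast still reads the same scalar, which is why broadcasts remain foldable without VLX.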
-multiclass avx512_vptest_lowering_wide_pats { - def : Pat<(_.KVT (OpNode (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV)), - (_.KVT (COPY_TO_REGCLASS - (!cast(InstrStr#"rr") - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC))>; - - def : Pat<(_.KVT (and _.KRC:$mask, - (OpNode_su (bitconvert - (AndInfo.VT (and _.RC:$src1, _.RC:$src2))), - _.ImmAllZerosV))), - (COPY_TO_REGCLASS - (!cast(InstrStr#"rrk") - (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src1, _.SubRegIdx), - (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), - _.RC:$src2, _.SubRegIdx)), - _.KRC)>; -} - -multiclass avx512_vptest_lowering_sizes { -let Predicates = [prd, HasVLX] in { - defm : avx512_vptest_lowering_pats; - defm : avx512_vptest_lowering_pats; -} -let Predicates = [prd] in { - defm : avx512_vptest_lowering_pats; -} - -let Predicates = [prd, NoVLX] in { - defm : avx512_vptest_lowering_wide_pats; - defm : avx512_vptest_lowering_wide_pats; -} -} - -multiclass avx512_vptest_lowering_types { - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; - defm : avx512_vptest_lowering_sizes; -} - -defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem, X86pcmpnem_su>; -defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm, X86pcmpeqm_su>; +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", + SchedWriteVecLogic>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", + SchedWriteVecLogic>, T8XS; //===----------------------------------------------------------------------===// // AVX-512 Shift instructions diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index 5b9b4005c95..2c7d63d5ab9 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -853,8 +853,8 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x ; KNL-NEXT: vpmovzxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd2] ; KNL-NEXT: ## zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero ; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf2,0x3f] -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x27,0xca] -; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xc2,0x0f,0x01] +; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01] +; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x27,0xca] ; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0] ; KNL-NEXT: retq ## encoding: [0xc3] ; diff --git a/test/CodeGen/X86/kshift.ll b/test/CodeGen/X86/kshift.ll index aaa5ff7567d..a12fd46b5af 100644 --- a/test/CodeGen/X86/kshift.ll +++ b/test/CodeGen/X86/kshift.ll @@ -10,8 +10,8 @@ define i8 @kshiftl_v8i1_1(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: movb 
$-2, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -75,15 +75,15 @@ define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: kshiftlw $1, %k2, %k1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 ; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2} +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %ecx -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax ; KNL-NEXT: orl %ecx, %eax @@ -112,38 +112,38 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: kshiftlw $1, %k1, %k3 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm6 +; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 +; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 +; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 ; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 ; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 +; KNL-NEXT: 
vptestmd %zmm3, %zmm3, %k4 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3} ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4} +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx @@ -182,8 +182,8 @@ define i8 @kshiftl_v8i1_7(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: movb $-128, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -308,8 +308,8 @@ define i8 @kshiftr_v8i1_1(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15] ; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 -; KNL-NEXT: vptestmq %zmm3, %zmm3, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm3, %zmm3, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -373,10 +373,10 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm0 -; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 ; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} @@ -411,44 +411,44 @@ define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm5 ; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3 +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 ; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 ; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} ; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0] -; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 -; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 -; KNL-NEXT: kshiftrw $1, %k3, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = 
zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0] +; KNL-NEXT: kshiftrw $1, %k1, %k3 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6 +; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 +; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4 ; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 ; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4} +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: orl %eax, %ecx ; KNL-NEXT: shlq $32, %rcx ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax ; KNL-NEXT: orl %edx, %eax @@ -480,8 +480,8 @@ define i8 @kshiftr_v8i1_7(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: movb $-2, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -605,8 +605,8 @@ define i8 @kshiftl_v8i1_zu123u56(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <8,u,1,2,3,u,5,6> ; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -636,8 +636,8 @@ define i8 @kshiftl_v8i1_u0123456(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6] ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -669,8 +669,8 @@ define i8 @kshiftr_v8i1_1u3u567z(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,u,3,u,5,6,7,15> ; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3 ; KNL-NEXT: vpsllq $63, %zmm3, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -700,8 +700,8 @@ define i8 @kshiftr_v8i1_234567uu(<8 x i64> %x, <8 x i64> %y) { ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,0,1] ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1} ; 
KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll index 01d5252c231..06862809786 100644 --- a/test/CodeGen/X86/movmsk-cmp.ll +++ b/test/CodeGen/X86/movmsk-cmp.ll @@ -2087,8 +2087,7 @@ define i1 @allones_v4i32_and1(<4 x i32> %arg) { ; KNL-LABEL: allones_v4i32_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andb $15, %al ; KNL-NEXT: cmpb $15, %al @@ -2131,8 +2130,7 @@ define i1 @allzeros_v4i32_and1(<4 x i32> %arg) { ; KNL-LABEL: allzeros_v4i32_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al @@ -2192,8 +2190,7 @@ define i1 @allones_v8i32_and1(<8 x i32> %arg) { ; KNL-LABEL: allones_v8i32_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: cmpb $-1, %al ; KNL-NEXT: sete %al @@ -2253,8 +2250,7 @@ define i1 @allzeros_v8i32_and1(<8 x i32> %arg) { ; KNL-LABEL: allzeros_v8i32_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: sete %al @@ -2573,8 +2569,7 @@ define i1 @allones_v4i64_and1(<4 x i64> %arg) { ; KNL-LABEL: allones_v4i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andb $15, %al ; KNL-NEXT: cmpb $15, %al @@ -2640,8 +2635,7 @@ define i1 @allzeros_v4i64_and1(<4 x i64> %arg) { ; KNL-LABEL: allzeros_v4i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al @@ -3686,8 +3680,7 @@ define i1 @allones_v4i32_and4(<4 x i32> %arg) { ; KNL-LABEL: allones_v4i32_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andb $15, %al ; KNL-NEXT: cmpb $15, %al @@ -3730,8 +3723,7 @@ define i1 @allzeros_v4i32_and4(<4 x i32> %arg) { ; KNL-LABEL: allzeros_v4i32_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al @@ -3791,8 +3783,7 @@ define i1 @allones_v8i32_and4(<8 x i32> %arg) { ; 
KNL-LABEL: allones_v8i32_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: cmpb $-1, %al ; KNL-NEXT: sete %al @@ -3852,8 +3843,7 @@ define i1 @allzeros_v8i32_and4(<8 x i32> %arg) { ; KNL-LABEL: allzeros_v8i32_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: sete %al @@ -4172,8 +4162,7 @@ define i1 @allones_v4i64_and4(<4 x i64> %arg) { ; KNL-LABEL: allones_v4i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: andb $15, %al ; KNL-NEXT: cmpb $15, %al @@ -4239,8 +4228,7 @@ define i1 @allzeros_v4i64_and4(<4 x i64> %arg) { ; KNL-LABEL: allzeros_v4i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0 +; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $15, %al ; KNL-NEXT: sete %al diff --git a/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/test/CodeGen/X86/prefer-avx256-mask-extend.ll index b4d452f2d3e..2ee68e39b91 100644 --- a/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -7,8 +7,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { ; AVX256-LABEL: testv8i1_sext_v8i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 @@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { ; ; AVX512VL-LABEL: testv8i1_sext_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 @@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) { define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_sext_v16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -56,9 +57,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_sext_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -88,9 +90,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_sext_v16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -101,9 +104,10 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_sext_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 @@ -131,8 +135,8 @@ define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { ; AVX256-LABEL: testv8i1_zext_v8i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm0, %xmm0 @@ -142,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { ; ; AVX512VL-LABEL: testv8i1_zext_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 @@ -168,9 +172,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) { define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_zext_v16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -184,9 +189,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_zext_v16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 @@ -216,9 +222,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) { define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; AVX256-LABEL: testv16i1_zext_v16i16: ; AVX256: # %bb.0: -; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256-NEXT: vmovdqa (%rsi), %ymm0 +; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2 ; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} ; AVX256-NEXT: vpmovdw %ymm1, %xmm1 @@ -230,9 +237,10 @@ define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) { ; ; AVX512VL-LABEL: testv16i1_zext_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 diff --git a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index 1983b7a638d..75564986809 100644 --- a/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -11,9 +11,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) { ; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX256VL: # %bb.0: -; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX256VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256VL-NEXT: vmovdqa (%rsi), %ymm1 +; AVX256VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX256VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z} ; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1 @@ -42,9 +43,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -57,9 +59,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX256VLBW: # %bb.0: -; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 -; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1 +; AVX256VLBW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX256VLBW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX256VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k0 +; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1 ; 
AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0 ; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1 ; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] @@ -71,9 +74,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 ; ; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1 -; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2 +; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 +; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 ; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll index 9bcb4fd1e63..1cf20bf62e4 100644 --- a/test/CodeGen/X86/setcc-lowering.ll +++ b/test/CodeGen/X86/setcc-lowering.ll @@ -24,8 +24,7 @@ define <8 x i16> @pr25080(<8 x i32> %a) { ; KNL-32-LABEL: pr25080: ; KNL-32: # %bb.0: # %entry ; KNL-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607] -; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0 +; KNL-32-NEXT: vptestnmd {{\.LCPI.*}}{1to16}, %zmm0, %k0 ; KNL-32-NEXT: movb $15, %al ; KNL-32-NEXT: kmovw %eax, %k1 ; KNL-32-NEXT: korw %k1, %k0, %k1 diff --git a/test/CodeGen/X86/vector-fshl-128.ll b/test/CodeGen/X86/vector-fshl-128.ll index 4a51e3341b2..e7e187629e5 100644 --- a/test/CodeGen/X86/vector-fshl-128.ll +++ b/test/CodeGen/X86/vector-fshl-128.ll @@ -108,16 +108,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -125,30 +123,27 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, 
%xmm1 +; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -156,16 +151,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -173,14 +166,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -358,16 +350,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512F-LABEL: var_funnnel_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: 
vpsllvd %xmm4, %xmm0, %xmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -375,30 +366,28 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VL-LABEL: var_funnnel_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -406,16 +395,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: 
vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -423,14 +411,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -719,17 +706,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -737,17 +722,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -755,14 +738,13 @@ define <8 x 
i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1040,21 +1022,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb 
%zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1062,21 +1042,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1084,19 +1062,18 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: vzeroupper @@ -1104,19 +1081,18 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: vzeroupper @@ -1276,14 +1252,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1292,14 +1267,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq @@ -1308,14 +1282,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; 
AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1325,14 +1298,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1341,14 +1313,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1497,15 +1468,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: 
vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper @@ -1514,16 +1485,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: retq @@ -1533,15 +1503,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1552,15 +1522,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd 
{{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1569,16 +1539,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -1759,16 +1728,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %xmm0, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -1778,16 +1746,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw 
%xmm5, %xmm0, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1796,16 +1763,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %xmm0, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: retq @@ -2031,18 +1997,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512BW-NEXT: vzeroupper @@ -2053,18 +2018,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2073,19 +2037,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLBW-NEXT: vzeroupper @@ -2094,19 +2057,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/vector-fshl-256.ll b/test/CodeGen/X86/vector-fshl-256.ll index 0133d9acca7..bf89b154930 100644 --- a/test/CodeGen/X86/vector-fshl-256.ll +++ b/test/CodeGen/X86/vector-fshl-256.ll @@ -71,76 +71,71 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; 
AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VLBW-NEXT: vpsubq 
%ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -257,76 +252,71 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; ; AVX512F-LABEL: var_funnnel_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastd 
{{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -514,48 +504,43 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw 
%ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -792,80 +777,74 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; 
AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq @@ -970,13 +949,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -984,14 +963,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -1001,13 +979,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; 
AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1017,13 +995,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1031,14 +1009,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1144,15 +1121,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512F-NEXT: retq @@ -1160,16 +1137,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VL-LABEL: 
splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -1179,15 +1155,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1197,15 +1173,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; 
AVX512VBMI2-NEXT: retq @@ -1213,16 +1189,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1366,16 +1341,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1384,16 +1358,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw 
%xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1401,16 +1374,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %ymm0, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1589,19 +1561,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512BW-NEXT: retq @@ -1610,19 +1581,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VBMI2-NEXT: retq @@ -1630,19 +1600,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq 
%zmm1, %zmm5, %zmm1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLBW-NEXT: retq @@ -1650,19 +1619,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero ; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0 ; AVX512VLVBMI2-NEXT: retq diff --git a/test/CodeGen/X86/vector-fshl-512.ll b/test/CodeGen/X86/vector-fshl-512.ll index cb29f33b9e8..ca7a26a6b3e 100644 --- a/test/CodeGen/X86/vector-fshl-512.ll +++ b/test/CodeGen/X86/vector-fshl-512.ll @@ -18,42 +18,39 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm5 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v8i64: ; AVX512BW: # 
%bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -65,14 +62,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512VLBW-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -88,42 +84,39 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastd 
{{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -135,14 +128,13 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -220,14 +212,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq 
%zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -239,14 +230,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -398,160 +388,156 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %zmm4, %zmm5, %zmm5 -; AVX512BW-NEXT: vpsllw $5, %zmm5, %zmm5 -; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512BW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 -; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2} -; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm5 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm5 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3 +; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vpmovb2m %zmm3, %k2 +; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: 
vpsllw $5, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512BW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k2 -; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 -; AVX512BW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2} -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm6 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6 -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} -; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1} -; AVX512BW-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpsllw $5, %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2} -; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2 +; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 +; 
AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5 -; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2 -; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 -; AVX512VBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2} -; AVX512VBMI2-NEXT: vpsllw $2, %zmm4, %zmm6 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} -; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1} -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpsllw $5, %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2} -; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm5 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; 
AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5 -; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2 -; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 -; AVX512VLBW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2} -; AVX512VLBW-NEXT: vpsllw $2, %zmm4, %zmm6 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6 -; AVX512VLBW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} -; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1} -; AVX512VLBW-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2} -; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2} +; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; 
AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1} +; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1} -; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2 -; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 -; AVX512VLVBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2} -; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm4, %zmm6 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1} -; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1 -; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1} -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq @@ -567,14 +553,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq @@ -582,14 +567,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, 
%zmm2, %zmm2 +; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq @@ -597,14 +581,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -618,14 +601,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -644,16 +626,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, 
%k1 +; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq @@ -661,16 +642,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VL-LABEL: splatvar_funnnel_v16i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VL-NEXT: retq @@ -678,16 +658,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512BW-LABEL: splatvar_funnnel_v16i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -701,16 +680,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpslld %xmm5, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; 
AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -769,16 +747,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512BW-LABEL: splatvar_funnnel_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -792,16 +769,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -880,24 +856,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512BW-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512BW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq @@ -905,24 +880,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VBMI2-NEXT: vpsrlw 
%xmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VBMI2-NEXT: retq @@ -930,24 +904,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5 -; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLBW-NEXT: retq @@ -955,24 +928,23 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6 -; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 -; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, 
%zmm5 -; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4 +; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero ; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4 ; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512VLVBMI2-NEXT: retq diff --git a/test/CodeGen/X86/vector-fshr-128.ll b/test/CodeGen/X86/vector-fshr-128.ll index 1f70fc95277..7c31f9c296e 100644 --- a/test/CodeGen/X86/vector-fshr-128.ll +++ b/test/CodeGen/X86/vector-fshr-128.ll @@ -110,16 +110,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -127,29 +125,26 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VL-LABEL: var_funnnel_v2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: 
vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -157,16 +152,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v2i64: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -174,14 +167,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v2i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -363,16 +355,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512F-LABEL: var_funnnel_v4i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5 -; AVX512F-NEXT: vpbroadcastd 
{{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -380,29 +371,27 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VL-LABEL: var_funnnel_v4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v4i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -410,16 +399,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v4i32: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: 
vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -427,14 +415,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v4i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -727,17 +714,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -745,17 +730,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v8i16: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -763,14 +746,13 @@ define <8 x i16> 
@var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlvw %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1056,21 +1038,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 
%zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1078,21 +1058,19 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VBMI2-LABEL: var_funnnel_v16i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1100,38 +1078,36 @@ define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: var_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, 
%ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq @@ -1294,14 +1270,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1310,14 +1285,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VL-LABEL: splatvar_funnnel_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; @@ -1325,14 +1299,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1342,14 +1315,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1358,14 +1330,13 @@ define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v2i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1516,15 +1487,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; 
AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -1533,16 +1504,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VL-LABEL: splatvar_funnnel_v4i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq ; @@ -1551,15 +1521,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1570,15 +1540,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: 
vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1587,16 +1557,15 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % ; AVX512VLBW-LABEL: splatvar_funnnel_v4i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1779,16 +1748,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1798,16 +1766,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = 
xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -1816,16 +1783,15 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -2053,18 +2019,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = 
[8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -2075,18 +2040,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero +; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpor %ymm3, 
%ymm0, %ymm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper @@ -2095,19 +2059,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLBW-NEXT: vzeroupper ; AVX512VLBW-NEXT: retq @@ -2115,19 +2078,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> % ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = 
xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq diff --git a/test/CodeGen/X86/vector-fshr-256.ll b/test/CodeGen/X86/vector-fshr-256.ll index 64283a6603b..1e55383a492 100644 --- a/test/CodeGen/X86/vector-fshr-256.ll +++ b/test/CodeGen/X86/vector-fshr-256.ll @@ -71,75 +71,70 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) ; ; AVX512F-LABEL: var_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # 
kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v4i64: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v4i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64] -; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64] +; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; 
AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -258,75 +253,70 @@ define <8 x i32> @var_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %amt) ; ; AVX512F-LABEL: var_funnnel_v8i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v8i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v8i32: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5 -; 
AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v8i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -516,48 +506,43 @@ define <16 x i16> @var_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v16i16: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; 
AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v16i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -796,79 +781,73 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v32i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: 
vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v32i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; 
AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v32i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; 
AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLVBMI2-NEXT: retq ; @@ -974,13 +953,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -988,14 +967,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VL-LABEL: splatvar_funnnel_v4i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; @@ -1004,13 +982,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1020,13 +998,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; 
AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1034,14 +1012,13 @@ define <4 x i64> @splatvar_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v4i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1147,15 +1124,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq @@ -1163,16 +1140,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VL-LABEL: splatvar_funnnel_v8i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VL-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; 
AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq ; @@ -1181,15 +1157,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512BW-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1199,15 +1175,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2 ; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VBMI2-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1215,16 +1191,15 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i32: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512VLBW-NEXT: vpsrld %xmm5, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2 +; 
AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1368,16 +1343,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1386,16 +1360,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1403,16 +1376,15 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; AVX512VLBW-LABEL: splatvar_funnnel_v16i16: ; AVX512VLBW: # %bb.0: ; 
AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -1590,19 +1562,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512BW-NEXT: retq @@ -1611,19 +1582,18 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %ymm4, 
%ymm6, %ymm4 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VBMI2-NEXT: retq @@ -1631,38 +1601,36 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VLBW-LABEL: splatvar_funnnel_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8: ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4 -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero -; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero +; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero +; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4 ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero ; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 ; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 -; 
AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1 +; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} ; AVX512VLVBMI2-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-fshr-512.ll b/test/CodeGen/X86/vector-fshr-512.ll index 063fd038f22..dd469fba1e0 100644 --- a/test/CodeGen/X86/vector-fshr-512.ll +++ b/test/CodeGen/X86/vector-fshr-512.ll @@ -18,40 +18,37 @@ declare <64 x i8> @llvm.fshr.v64i8(<64 x i8>, <64 x i8>, <64 x i8>) define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v8i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -63,14 +60,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) ; ; AVX512VLBW-LABEL: var_funnnel_v8i64: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64] -; AVX512VLBW-NEXT: vpsubq 
%zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64] +; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -86,40 +82,37 @@ define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v16i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5 -; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v16i32: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -131,14 +124,13 @@ 
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> % ; ; AVX512VLBW-LABEL: var_funnnel_v16i32: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -216,14 +208,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512BW-LABEL: var_funnnel_v32i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -235,14 +226,13 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % ; ; AVX512VLBW-LABEL: var_funnnel_v32i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -394,28 +384,27 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; ; AVX512BW-LABEL: var_funnnel_v64i8: 
; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm5 -; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512BW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k2 -; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5 +; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1} +; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512BW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2} -; AVX512BW-NEXT: vpsrlw $2, %zmm5, %zmm7 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm5, %zmm7 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512BW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %zmm4, %zmm6, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm6 -; AVX512BW-NEXT: vpmovb2m %zmm6, %k1 +; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5 +; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k2 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 @@ -423,38 +412,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm4 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm4 +; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4 ; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; ; AVX512VBMI2-LABEL: var_funnnel_v64i8: ; AVX512VBMI2: # %bb.0: -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm5 -; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5 +; 
AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5 +; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2} -; AVX512VBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7 -; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4 ; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6 -; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1 +; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5 +; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2 ; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 @@ -462,38 +450,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm4 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1} -; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4 +; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4 ; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1 ; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: retq ; ; AVX512VLBW-LABEL: var_funnnel_v64i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm5 -; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5 +; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsrlw $2, 
%zmm3, %zmm5 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; AVX512VLBW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2} -; AVX512VLBW-NEXT: vpsrlw $2, %zmm5, %zmm7 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VLBW-NEXT: vpsrlw $1, %zmm5, %zmm7 -; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm4, %zmm4 ; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm6 -; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1 +; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5 +; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 @@ -501,38 +488,37 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 ; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1} -; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm4 +; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4 ; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1 ; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; ; AVX512VLVBMI2-LABEL: var_funnnel_v64i8: ; AVX512VLVBMI2: # %bb.0: -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm5 -; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2 -; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5 +; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 +; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5 +; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5 -; 
AVX512VLVBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2} -; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7 -; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7 -; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1 -; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 +; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1} +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4 -; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6 -; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1 +; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5 +; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2 ; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 @@ -540,11 +526,11 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm4 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1} -; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4 +; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4 ; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1 ; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} -; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1 +; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLVBMI2-NEXT: retq %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) @@ -559,42 +545,39 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v8i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; 
AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0 -; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq ; @@ -608,14 +591,13 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512VLBW-LABEL: splatvar_funnnel_v8i64: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2 -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63] -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64] -; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4 +; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64] +; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1 +; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq ; @@ -634,48 +616,45 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i ; AVX512F-LABEL: splatvar_funnnel_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero -; AVX512F-NEXT: vpsrld %xmm5, %zmm1, %zmm5 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32] -; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero +; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32] +; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4 ; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0 -; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1 +; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0 +; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: 
splatvar_funnnel_v16i32:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 ;
@@ -689,16 +668,15 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i
 ; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
 ; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT: retq
 ;
@@ -757,16 +735,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512BW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 ;
@@ -780,16 +757,15 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i
 ; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
 ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT: retq
 ;
@@ -868,96 +844,92 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
 ; AVX512BW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
 ; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VBMI2: # %bb.0:
 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
 ; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VBMI2-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
 ; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLBW-NEXT: retq
 ;
 ; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
 ; AVX512VLVBMI2: # %bb.0:
 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
 ; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
 ; AVX512VLVBMI2-NEXT: retq
 %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index f7097133268..d5b7b7152aa 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -442,16 +441,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv32i16u:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
 ; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -549,16 +547,15 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: testv64i8:
@@ -640,16 +637,15 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
 ;
 ; AVX512BW-LABEL: testv64i8u:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512DQ-LABEL: testv64i8u:
-- 
2.50.1
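Reviewer note, not part of the patch above: the vptestnmb/vptestnmd checks in these tests come from a lane-wise "is zero" test whose mask feeds a vector select or mask move. A minimal, illustrative sketch of that IR shape follows; the function name @vptestnm_sketch and the suggested llc invocation are assumptions for illustration, not taken from the patch.

; Built with something like: llc -mtriple=x86_64-- -mattr=+avx512f
; Illustrative sketch only; names here are hypothetical.
define <16 x i32> @vptestnm_sketch(<16 x i32> %x, <16 x i32> %y, <16 x i32> %a, <16 x i32> %b) {
  ; Lane-wise test of (and %x, %y) against zero produces a k-mask; this is the
  ; shape that the vptestnm matcher handles, as seen in the checks above where
  ; the masking AND result is tested against itself.
  %and = and <16 x i32> %x, %y
  %cmp = icmp eq <16 x i32> %and, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i32> %a, <16 x i32> %b
  ret <16 x i32> %sel
}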