return SDValue();
}
-static
-std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
- SDLoc DL(N);
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
-
- // Split the inputs.
- SDValue Lo, Hi, LL, LH, RL, RH;
- std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
- std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
-
- Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
- Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
-
- return std::make_pair(Lo, Hi);
-}
-
// This function assumes all the vselect's arguments are CONCAT_VECTOR
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue Mask = MSC->getMask();
- SDValue Data = MSC->getValue();
SDValue Chain = MSC->getChain();
SDLoc DL(N);
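+ // Zap scatters with a zero mask.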
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MSCATTER data type requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() != ISD::SETCC)
- return SDValue();
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
- SDValue MaskLo, MaskHi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0));
-
- EVT MemoryVT = MSC->getMemoryVT();
- unsigned Alignment = MSC->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue DataLo, DataHi;
- std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
-
- SDValue Scale = MSC->getScale();
- SDValue BasePtr = MSC->getBasePtr();
- SDValue IndexLo, IndexHi;
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MSC->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, MSC->getAAInfo(), MSC->getRanges());
-
- SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale };
- SDValue Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
- DataLo.getValueType(), DL, OpsLo, MMO,
- MSC->getIndexType());
-
- // The order of the Scatter operation after split is well defined. The "Hi"
- // part comes after the "Lo". So these two operations should be chained one
- // after another.
- SDValue OpsHi[] = { Lo, DataHi, MaskHi, BasePtr, IndexHi, Scale };
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
- DL, OpsHi, MMO, MSC->getIndexType());
+ return SDValue();
}
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
- SDValue Data = MST->getValue();
SDValue Chain = MST->getChain();
- EVT VT = Data.getValueType();
SDLoc DL(N);
// Zap masked stores with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MSTORE data type requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() == ISD::SETCC) {
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue Ptr = MST->getBasePtr();
-
- EVT MemoryVT = MST->getMemoryVT();
- unsigned Alignment = MST->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue DataLo, DataHi;
- std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MST->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, MST->getAAInfo(), MST->getRanges());
-
- Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
- MST->isTruncatingStore(),
- MST->isCompressingStore());
-
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MST->isCompressingStore());
- unsigned HiOffset = LoMemVT.getStoreSize();
-
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- MST->getPointerInfo().getWithOffset(HiOffset),
- MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment,
- MST->getAAInfo(), MST->getRanges());
-
- Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
- MST->isTruncatingStore(),
- MST->isCompressingStore());
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
- }
return SDValue();
}
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MGATHER result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
-
- if (Mask.getOpcode() != ISD::SETCC)
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue PassThru = MGT->getPassThru();
- SDValue PassThruLo, PassThruHi;
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-
- SDValue Chain = MGT->getChain();
- EVT MemoryVT = MGT->getMemoryVT();
- unsigned Alignment = MGT->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue Scale = MGT->getScale();
- SDValue BasePtr = MGT->getBasePtr();
- SDValue Index = MGT->getIndex();
- SDValue IndexLo, IndexHi;
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MGT->getAAInfo(), MGT->getRanges());
-
- SDValue OpsLo[] = { Chain, PassThruLo, MaskLo, BasePtr, IndexLo, Scale };
- Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
- MMO, MGT->getIndexType());
-
- SDValue OpsHi[] = { Chain, PassThruHi, MaskHi, BasePtr, IndexHi, Scale };
- Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
- MMO, MGT->getIndexType());
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- // Build a factor node to remember that this load is independent of the
- // other one.
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Legalized the chain result - switch anything that used the old chain to
- // use the new one.
- DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain);
-
- SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-
- SDValue RetOps[] = { GatherRes, Chain };
- return DAG.getMergeValues(RetOps, DL);
+ return SDValue();
}
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MLD->getPassThru(), MLD->getChain());
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MLOAD result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() == ISD::SETCC) {
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue PassThru = MLD->getPassThru();
- SDValue PassThruLo, PassThruHi;
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
-
- SDValue Chain = MLD->getChain();
- SDValue Ptr = MLD->getBasePtr();
- EVT MemoryVT = MLD->getMemoryVT();
- unsigned Alignment = MLD->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MLD->getAAInfo(), MLD->getRanges());
-
- Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, PassThruLo, LoMemVT,
- MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
-
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MLD->isExpandingLoad());
- unsigned HiOffset = LoMemVT.getStoreSize();
-
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- MLD->getPointerInfo().getWithOffset(HiOffset),
- MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
- MLD->getAAInfo(), MLD->getRanges());
-
- Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, PassThruHi, HiMemVT,
- MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- // Build a factor node to remember that this load is independent of the
- // other one.
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Legalized the chain result - switch anything that used the old chain to
- // use the new one.
- DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
-
- SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-
- SDValue RetOps[] = { LoadRes, Chain };
- return DAG.getMergeValues(RetOps, DL);
- }
return SDValue();
}
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
//===--------------------------------------------------------------------===//
// Generic Expansion: LegalizeTypesGeneric.cpp
//===--------------------------------------------------------------------===//
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
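+ // If the mask came directly from a SETCC, split the SETCC itself along with
+ // its operands. This keeps the compare as two narrower vector SETCCs instead
+ // of letting it be unrolled into scalar comparisons, and preserves it for
+ // later combines (e.g. min/max pattern matching on X86).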
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
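+ // As for MLOAD above, split a SETCC mask into two narrower SETCCs rather
+ // than splitting its result.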
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- // Split Mask operand
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
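+ // When the data operand is the one being split, split a SETCC mask into two
+ // narrower SETCCs so the compare is not scalarized.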
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- // Split Mask operand
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
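+ // Same as for MSTORE: split a SETCC mask into two narrower SETCCs.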
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
define <16 x double> @test_gather_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
; KNL_64-LABEL: test_gather_setcc_split:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
-; KNL_64-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
+; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; KNL_64-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; KNL_64-NEXT: vmovapd %zmm2, %zmm0
; KNL_64-NEXT: vmovapd %zmm3, %zmm1
; KNL_64-NEXT: retq
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
-; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
-; KNL_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
+; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: vmovapd %zmm3, %zmm1
; KNL_32-NEXT: movl %ebp, %esp
;
; SKX-LABEL: test_gather_setcc_split:
; SKX: # %bb.0:
-; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
-; SKX-NEXT: vgatherdpd (%rdi,%ymm4,8), %zmm3 {%k1}
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; SKX-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; SKX-NEXT: vmovapd %zmm2, %zmm0
; SKX-NEXT: vmovapd %zmm3, %zmm1
; SKX-NEXT: retq
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
-; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
-; SKX_32-NEXT: vgatherdpd (%eax,%ymm4,8), %zmm3 {%k1}
+; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: vmovapd %zmm3, %zmm1
; SKX_32-NEXT: movl %ebp, %esp
define void @test_scatter_setcc_split(double* %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_setcc_split:
; KNL_64: # %bb.0:
-; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; KNL_64-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_64-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL_64-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
-; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
+; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-NEXT: subl $64, %esp
; KNL_32-NEXT: vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT: movl 8(%ebp), %eax
-; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; KNL_32-NEXT: vptestnmd %zmm5, %zmm5, %k1
+; KNL_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; KNL_32-NEXT: vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT: vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
-; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
+; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: .cfi_def_cfa %esp, 4
;
; SKX-LABEL: test_scatter_setcc_split:
; SKX: # %bb.0:
-; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
-; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm4,8) {%k1}
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-NEXT: subl $64, %esp
; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
-; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm5
-; SKX_32-NEXT: vptestnmd %ymm5, %ymm5, %k1
+; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
-; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm4,8) {%k1}
+; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vmaskmovpd 32(%rdi), %ymm3, %ymm1
; AVX1-NEXT: vpmovsxdq %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovsxwq %xmm0, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm4
; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm3, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
+; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm4
; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm1, %ymm0
; AVX2-NEXT: vpmaskmovq 32(%rdi), %ymm3, %ymm1