if (HasInt256) {
// Custom legalize 2x32 to get a little better code.
setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
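A minimal IR reproducer that exercises this new custom legalization (a sketch modeled on the masked_gather_v2i32 test updated below; the function name is illustrative):

declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

define <2 x i32> @gather_v2i32_example(<2 x i32*> %ptrs, <2 x i1> %mask, <2 x i32> %passthru) {
  ; A two-element i32 gather: the v2i32 result type is illegal and is
  ; now custom-widened instead of being promoted to v2i64.
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptrs, i32 4, <2 x i1> %mask, <2 x i32> %passthru)
  ret <2 x i32> %res
}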
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
+ // If the index is v2i32, we're being called by type legalization.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
+
if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors. Or data or index should
// be 512 bit wide.
SDValue RetOps[] = {Extract, NewGather.getValue(2)};
return DAG.getMergeValues(RetOps, dl);
}
- if (N->getMemoryVT() == MVT::v2i32) {
- // There is a special case when the return type is v2i32 is illegal and
- // the type legaizer extended it to v2i64. Without this conversion we end up
- // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
- // In order to avoid this situation, we'll build an X86 specific Gather node
- // with index v2i64 and value type v4i32.
- assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
- "Unexpected type in masked gather");
- Src0 =
- DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src0),
- DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
- // The mask should match the destination type. Extending mask with zeroes
- // is not necessary since instruction itself reads only two values from
- // memory.
- SDVTList VTList;
- if (Subtarget.hasVLX()) {
- Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
- VTList = DAG.getVTList(MVT::v4i32, MVT::v2i1, MVT::Other);
- } else {
- Mask =
- DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Mask),
- DAG.getUNDEF(MVT::v4i32), {0, 2, -1, -1});
- VTList = DAG.getVTList(MVT::v4i32, MVT::v4i32, MVT::Other);
- }
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTList, Ops, dl, N->getMemoryVT(), N->getMemOperand());
-
- SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
- NewGather.getValue(0), DAG);
- SDValue RetOps[] = { Sext, NewGather.getValue(2) };
- return DAG.getMergeValues(RetOps, dl);
- }
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
Results.push_back(Res.getValue(2));
return;
}
+ if (VT == MVT::v2i32) {
+ auto *Gather = cast<MaskedGatherSDNode>(N);
+ SDValue Index = Gather->getIndex();
+ SDValue Mask = Gather->getMask();
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
+ Gather->getValue(),
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 we can use it directly.
+ if (Index.getValueType() == MVT::v2i64 &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
+ SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
+ Index };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
+ Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Chain = Res.getValue(2);
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ EVT IndexVT = Index.getValueType();
+ EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ IndexVT.getScalarType(), 4);
+ // Otherwise we need to custom widen everything to avoid promotion.
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(IndexVT));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
+ Index };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
+ Gather->getMemoryVT(), dl, Ops,
+ Gather->getMemOperand());
+ SDValue Chain = Res.getValue(1);
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
break;
}
}
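Conceptually, the ReplaceNodeResults path added above corresponds to the following IR rewrite (a hand-written sketch of the generic widening branch, not literal DAG output): the index and passthru are padded to four lanes with undef, the extra mask lanes are zeroed so no additional memory is read, and the low two lanes of the wide result are extracted.

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

define <2 x i32> @gather_v2i32_widened(<2 x i32*> %ptrs, <2 x i1> %mask, <2 x i32> %passthru) {
  ; Pad the pointers and passthru to four lanes with undef.
  %wide.ptrs = shufflevector <2 x i32*> %ptrs, <2 x i32*> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %wide.thru = shufflevector <2 x i32> %passthru, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Pad the mask with zero lanes so the extra elements are never loaded.
  %wide.mask = shufflevector <2 x i1> %mask, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Gather four lanes; only the first two are live.
  %wide.res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %wide.ptrs, i32 4, <4 x i1> %wide.mask, <4 x i32> %wide.thru)
  ; Extract the original two lanes of the result.
  %res = shufflevector <4 x i32> %wide.res, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %res
}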
define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32:
; X86: # BB#0: # %entry
-; X86-NEXT: vpsllq $63, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovsxdq (%eax), %xmm2
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
-; X86-NEXT: vpmovsxdq %xmm1, %xmm0
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2i32:
; X64: # BB#0: # %entry
-; X64-NEXT: vpsllq $63, %xmm0, %xmm0
; X64-NEXT: vmovdqa (%rdi), %xmm2
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
-; X64-NEXT: vpmovsxdq %xmm1, %xmm0
+; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; X64-NEXT: retq
;
; NOGATHER-LABEL: masked_gather_v2i32:
define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32_concat:
; X86: # BB#0: # %entry
-; X86-NEXT: vpsllq $63, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vpmovsxdq (%eax), %xmm2
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
-; X86-NEXT: vpmovsxdq %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: masked_gather_v2i32_concat:
; X64: # BB#0: # %entry
-; X64-NEXT: vpsllq $63, %xmm0, %xmm0
; X64-NEXT: vmovdqa (%rdi), %xmm2
; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
-; X64-NEXT: vpmovsxdq %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
;
; NOGATHER-LABEL: masked_gather_v2i32_concat:
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23:
; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %xmm2<def> %xmm2<kill> %zmm2<def>
-; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
-; KNL_64-NEXT: vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %xmm2<def> %xmm2<kill> %zmm2<def>
-; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
-; KNL_32-NEXT: vmovdqa %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: test23b:
; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %xmm2<def> %xmm2<kill> %zmm2<def>
; KNL_64-NEXT: # kill: %xmm0<def> %xmm0<kill> %zmm0<def>
-; KNL_64-NEXT: vmovdqa %xmm1, %xmm1
-; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23b:
; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %xmm2<def> %xmm2<kill> %zmm2<def>
; KNL_32-NEXT: # kill: %xmm0<def> %xmm0<kill> %zmm0<def>
-; KNL_32-NEXT: vmovdqa %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23b:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-NEXT: retl
%gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
; KNL_64: # BB#0:
-; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm1
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm1,8), %zmm0 {%k1}
-; KNL_64-NEXT: # kill: %xmm0<def> %xmm0<kill> %zmm0<kill>
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
; KNL_32: # BB#0:
-; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vmovdqa {{.*#+}} xmm0 = [1,0,1,0]
-; KNL_32-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm1,8), %zmm0 {%k1}
-; KNL_32-NEXT: # kill: %xmm0<def> %xmm0<kill> %zmm0<kill>
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
; SKX: # BB#0:
-; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
; SKX_32: # BB#0:
-; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
-; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX_32-NEXT: movb $3, %cl
+; SKX_32-NEXT: kmovw %ecx, %k1
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind