From: Simon Pilgrim
Date: Fri, 15 Mar 2019 19:14:28 +0000 (+0000)
Subject: [X86][SSE] Fold scalar_to_vector(i64 anyext(x)) -> bitcast(scalar_to_vector(i32 anyex...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e767531b14930999fa85c0d524faaf5969d56a9c;p=llvm

[X86][SSE] Fold scalar_to_vector(i64 anyext(x)) -> bitcast(scalar_to_vector(i32 anyext(x)))

Reduce the size of an any-extended i64 scalar_to_vector source to i32 - the
any_extend nodes are often introduced by SimplifyDemandedBits.
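As a sketch, the new combine rewrites

  t1: i64 = any_extend t0          (t0's scalar type is at most 32 bits wide)
  t2: v2i64 = scalar_to_vector t1

into

  t3: i32 = any_extend/truncate t0
  t4: v4i32 = scalar_to_vector t3
  t5: v2i64 = bitcast t4

(the value numbers above are illustrative, not taken from a real DAG dump).
As the updated tests show, this lets us select a movd of the 32-bit register
rather than a movq of the implicitly zero-extended 64-bit register:

  before:  # kill: def $edi killed $edi def $rdi
           movq %rdi, %xmm0
  after:   movd %edi, %xmm0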
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356292 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d862469db84..713ea356584 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -42402,6 +42402,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   SDValue Src = N->getOperand(0);
+  SDLoc DL(N);
 
   // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
   // This occurs frequently in our masked scalar intrinsic code and our
@@ -42410,7 +42411,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
   if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
       if (C->getAPIntValue().isOneValue())
-        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
                            Src.getOperand(0));
 
   // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
@@ -42419,8 +42420,17 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
       Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
     if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
       if (C->isNullValue())
-        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
-                           Src.getOperand(0), Src.getOperand(1));
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
+                           Src.getOperand(1));
+
+  // Reduce v2i64 to v4i32 if we don't need the upper bits.
+  // TODO: Move to DAGCombine?
+  if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
+      Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
+      Src.getOperand(0).getScalarValueSizeInBits() <= 32)
+    return DAG.getBitcast(
+        VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+                        DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
 
   return SDValue();
 }
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index b1a63ffedf3..6fa470a06ef 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -13,8 +13,7 @@
 define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i2_2i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -25,8 +24,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ;
 ; AVX1-LABEL: ext_i2_2i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -35,8 +33,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ;
 ; AVX2-LABEL: ext_i2_2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vmovd %edi, %xmm0
 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -190,8 +187,7 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i4_4i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
@@ -208,8 +204,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ;
 ; AVX1-LABEL: ext_i4_4i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
@@ -408,8 +403,7 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i8_8i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
@@ -438,8 +432,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ;
 ; AVX1-LABEL: ext_i8_8i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index c524021866d..e9886a12bd6 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -14,8 +14,7 @@
 define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i2_2i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -27,8 +26,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ;
 ; AVX1-LABEL: ext_i2_2i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -38,8 +36,7 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
 ;
 ; AVX2-LABEL: ext_i2_2i64:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vmovd %edi, %xmm0
 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -244,8 +241,7 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
 define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i4_4i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
@@ -264,8 +260,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
 ;
 ; AVX1-LABEL: ext_i4_4i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
@@ -527,8 +522,7 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
 define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ; SSE2-SSSE3-LABEL: ext_i8_8i64:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
@@ -561,8 +555,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
 ;
 ; AVX1-LABEL: ext_i8_8i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
 ; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 6a8726b3a2a..5a2b4b2494e 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -8,8 +8,7 @@
 define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
 ; SSE2-SSSE3-LABEL: bitcast_i2_2i1:
 ; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
 ; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
 ; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -21,8 +20,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
 ;
 ; AVX1-LABEL: bitcast_i2_2i1:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -32,8 +30,7 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
 ;
 ; AVX2-LABEL: bitcast_i2_2i1:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vmovd %edi, %xmm0
 ; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index 719013f2714..d2135a6a3bb 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -244,12 +244,11 @@ define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2
 ;
 ; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: vpsrlq $61, %xmm0, %xmm0
 ; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [4,8]
 ; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovq %rdi, %xmm1
+; X64-NEXT: vmovd %edi, %xmm1
 ; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vmovq %xmm0, %rax
 ; X64-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index b0637bd4ab4..3f8bc4adb1b 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -17,9 +17,8 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
 ;
 ; X64-LABEL: t1:
 ; X64: # %bb.0:
-; X64-NEXT: # kill: def $edi killed $edi def $rdi
 ; X64-NEXT: shll $12, %edi
-; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: movd %edi, %xmm0
 ; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT: movq %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index f29e53d0143..ce0ff7e0a64 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -567,11 +567,10 @@ define <2 x i64> @shrunkblend_nonvselectuse(<2 x i1> %cond, <2 x i64> %a, <2 x i
 define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ; SSE2-LABEL: simplify_select:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: # kill: def $edi killed $edi def $rdi
 ; SSE2-NEXT: psllq $63, %xmm0
 ; SSE2-NEXT: psrad $31, %xmm0
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: movq %rdi, %xmm1
+; SSE2-NEXT: movd %edi, %xmm1
 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
 ; SSE2-NEXT: movdqa %xmm2, %xmm3
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
@@ -582,15 +581,13 @@ define <2 x i32> @simplify_select(i32 %x, <2 x i1> %z) {
 ;
 ; SSE41-LABEL: simplify_select:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: # kill: def $edi killed $edi def $rdi
-; SSE41-NEXT: movq %rdi, %xmm0
+; SSE41-NEXT: movd %edi, %xmm0
 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: simplify_select:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
-; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovd %edi, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 33c81886faf..3525703b921 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -92,7 +92,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X64-SSE2: # %bb.0: # %entry
 ; X64-SSE2-NEXT: movzwl (%rsi), %eax
-; X64-SSE2-NEXT: movq %rax, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm0
 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -116,7 +116,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE42: # %bb.0: # %entry
 ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movq %rcx, %xmm0
+; X64-SSE42-NEXT: movd %ecx, %xmm0
 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT: pslld $24, %xmm0
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 5bdd7562d1e..e807d6f4043 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -124,7 +124,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
 ; X64-SSE2: # %bb.0: # %entry
 ; X64-SSE2-NEXT: movzwl (%rsi), %eax
-; X64-SSE2-NEXT: movq %rax, %xmm0
+; X64-SSE2-NEXT: movd %eax, %xmm0
 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT: movzbl 2(%rsi), %eax
@@ -144,7 +144,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE42: # %bb.0: # %entry
 ; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT: movzwl (%rsi), %ecx
-; X64-SSE42-NEXT: movq %rcx, %xmm0
+; X64-SSE42-NEXT: movd %ecx, %xmm0
 ; X64-SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT: pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT: pand {{.*}}(%rip), %xmm0