From: Simon Pilgrim
Date: Thu, 9 Feb 2017 11:50:19 +0000 (+0000)
Subject: [X86][SSE] Attempt to break register dependencies during lowerBuildVector
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=85a8f6df5abe03373d1f17e6f0f5cb809a8cb750;p=llvm

[X86][SSE] Attempt to break register dependencies during lowerBuildVector

LowerBuildVectorv16i8/LowerBuildVectorv8i16 insert values into an UNDEF vector
if the build vector doesn't contain any zero elements, which creates a false
dependency on whatever previously used that register.

This patch attempts to break the register dependency either by always zeroing
the vector beforehand or, if we're inserting into the 0th element, by using
VZEXT_MOVL(SCALAR_TO_VECTOR(i32 AEXT(Elt))), which lowers to (V)MOVD and
performs a similar function. Additionally, (V)MOVD is a shorter instruction
than PINSRB/PINSRW. We already do something similar for SSE41 PINSRD.

On pre-SSE41 targets, LowerBuildVectorv16i8 goes a little further and uses
VZEXT_MOVL(SCALAR_TO_VECTOR(i32 ZEXT(Elt))) if the build vector contains
zeros, avoiding the vector zeroing at the cost of a scalar zero extension.
A future patch could probably bring this over to some of the other cases
(load folding etc.).

Differential Revision: https://reviews.llvm.org/D29720

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294581 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 807acd5b5f8..6f9f6771e33 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5942,12 +5942,21 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     for (unsigned i = 0; i < 16; ++i) {
       bool IsNonZero = (NonZeros & (1 << i)) != 0;
       if (IsNonZero) {
+        // If the build vector contains zeros or our first insertion is not the
+        // first index then insert into zero vector to break any register
+        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
         if (First) {
-          if (NumZero)
-            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
-          else
-            V = DAG.getUNDEF(MVT::v16i8);
           First = false;
+          if (NumZero || 0 != i)
+            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+          else {
+            assert(0 == i && "Expected insertion into zero-index");
+            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+            V = DAG.getBitcast(MVT::v16i8, V);
+            continue;
+          }
         }
         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
                         Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
@@ -5969,6 +5978,8 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }
 
     if ((i & 1) != 0) {
+      // FIXME: Investigate extending to i32 instead of just i16.
+      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
       SDValue ThisElt, LastElt;
       bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
       if (LastIsNonZero) {
@@ -5984,9 +5995,18 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
       } else
        ThisElt = LastElt;
 
-      if (ThisElt)
-        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
-                        DAG.getIntPtrConstant(i / 2, dl));
+      if (ThisElt) {
+        if (1 == i) {
+          V = NumZero ?
DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32) + : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + } else { + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, + DAG.getIntPtrConstant(i / 2, dl)); + } + } } } @@ -6007,12 +6027,21 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, for (unsigned i = 0; i < 8; ++i) { bool IsNonZero = (NonZeros & (1 << i)) != 0; if (IsNonZero) { + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. if (First) { - if (NumZero) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); - else - V = DAG.getUNDEF(MVT::v8i16); First = false; + if (NumZero || 0 != i) + V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(MVT::v8i16, V); + continue; + } } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 1213fb1ec66..f889526baa3 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -1062,7 +1062,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k4, %eax ; CHECK-NEXT: kmovw %k3, %ecx -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm0 ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: kmovw %k5, %eax ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 @@ -1110,7 +1110,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordq %zmm1, %zmm0, %k3 {%k3} ; CHECK-NEXT: kmovw %k5, %eax ; CHECK-NEXT: kmovw %k4, %ecx -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm0 ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: kmovw %k6, %eax ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 @@ -1159,7 +1159,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) { ; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k0 ; CHECK-NEXT: kmovw %k4, %eax ; CHECK-NEXT: kmovw %k3, %ecx -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm0 ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: kmovw %k5, %eax ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 @@ -1207,7 +1207,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmporduq %zmm1, %zmm0, %k3 {%k3} ; CHECK-NEXT: kmovw %k5, %eax ; CHECK-NEXT: kmovw %k4, %ecx -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %ecx, %xmm0 ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ; CHECK-NEXT: kmovw %k6, %eax ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index c34fac3c994..79d8e53c514 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -696,7 +696,7 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k0 
## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x07] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] ; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -744,7 +744,7 @@ define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xd9,0x07] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -793,7 +793,7 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x07] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] ; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -841,7 +841,7 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) { ; CHECK-NEXT: vpcmporduw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xd9,0x07] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 6510e6d7ac2..a067e0ad27b 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -16,7 +16,7 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x07] ; 
CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] ; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -64,7 +64,7 @@ define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordd %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xd9,0x07] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -113,7 +113,7 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) { ; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x07] ; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4] ; CHECK-NEXT: kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -161,7 +161,7 @@ define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordud %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xd9,0x07] ; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5] ; CHECK-NEXT: kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -210,7 +210,7 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -258,7 +258,7 
@@ define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -307,7 +307,7 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -355,7 +355,7 @@ define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmporduq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -406,7 +406,7 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -454,7 +454,7 @@ define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordd %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -503,7 +503,7 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) { ; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -551,7 +551,7 @@ define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordud %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -600,7 +600,7 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -648,7 +648,7 @@ define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmpordq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -697,7 +697,7 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) { ; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x07] ; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: 
[0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] @@ -745,7 +745,7 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) { ; CHECK-NEXT: vpcmporduq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xf9,0x07] ; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0] ; CHECK-NEXT: kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd] -; CHECK-NEXT: vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00] +; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1] ; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02] ; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6] ; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04] diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll index 61d8a4fdea4..730376acdc9 100644 --- a/test/CodeGen/X86/buildvec-insertvec.ll +++ b/test/CodeGen/X86/buildvec-insertvec.ll @@ -270,6 +270,7 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) { ; CHECK-LABEL: test_buildvector_v8i16_partial: ; CHECK: # BB#0: +; CHECK-NEXT: pxor %xmm0, %xmm0 ; CHECK-NEXT: pinsrw $1, %edi, %xmm0 ; CHECK-NEXT: pinsrw $3, %esi, %xmm0 ; CHECK-NEXT: pinsrw $4, %edx, %xmm0 @@ -419,6 +420,7 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11 ; ; SSE41-LABEL: test_buildvector_v16i8_partial: ; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: pinsrb $6, %esi, %xmm0 ; SSE41-NEXT: pinsrb $8, %edx, %xmm0 @@ -448,10 +450,9 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11 define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) { ; SSE2-LABEL: test_buildvector_v16i8_register_zero: ; SSE2: # BB#0: -; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 ; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movzbl %dil, %esi +; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: pinsrw $2, %eax, %xmm0 ; SSE2-NEXT: movzbl %dl, %eax ; SSE2-NEXT: pinsrw $3, %eax, %xmm0 diff --git a/test/CodeGen/X86/promote-vec3.ll b/test/CodeGen/X86/promote-vec3.ll index 7a496714622..5483090dab6 100644 --- a/test/CodeGen/X86/promote-vec3.ll +++ b/test/CodeGen/X86/promote-vec3.ll @@ -9,17 +9,16 @@ define <3 x i16> @zext_i8(<3 x i8>) { ; SSE3-LABEL: zext_i8: ; SSE3: # BB#0: ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pinsrw $0, %eax, %xmm1 +; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pinsrw $1, %eax, %xmm1 +; SSE3-NEXT: pinsrw $1, %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pinsrw $2, %eax, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSE3-NEXT: pinsrw $2, %eax, %xmm0 +; 
SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pextrw $2, %xmm0, %edx +; SSE3-NEXT: pextrw $4, %xmm0, %ecx ; SSE3-NEXT: # kill: %AX %AX %EAX ; SSE3-NEXT: # kill: %DX %DX %EDX ; SSE3-NEXT: # kill: %CX %CX %ECX @@ -74,7 +73,7 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; SSE3-LABEL: sext_i8: ; SSE3: # BB#0: ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 +; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; SSE3-NEXT: pinsrw $1, %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -93,7 +92,7 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; ; SSE41-LABEL: sext_i8: ; SSE41: # BB#0: -; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pslld $24, %xmm0 @@ -108,7 +107,7 @@ define <3 x i16> @sext_i8(<3 x i8>) { ; ; AVX-32-LABEL: sext_i8: ; AVX-32: # BB#0: -; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpslld $24, %xmm0, %xmm0 diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 923290411ae..8f9b8c156d3 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -482,7 +482,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx ; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx ; AVX512BW-NEXT: vpextrb $0, %xmm0, %edi -; AVX512BW-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %edi, %xmm0 ; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0 ; AVX512BW-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0 @@ -496,9 +496,9 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 ; AVX512BWVL-NEXT: vpextrb $8, %xmm0, %eax +; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %ecx +; AVX512BWVL-NEXT: vmovd %ecx, %xmm1 ; AVX512BWVL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vextracti32x4 $1, %zmm0, %xmm2 ; AVX512BWVL-NEXT: vpextrb $0, %xmm2, %eax diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index dda50b7b94b..f723672141a 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -97,10 +97,10 @@ define <8 x i8> @foo3_8(<8 x float> %src) { ; ; CHECK-WIDE-LABEL: foo3_8: ; CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 ; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax @@ -134,10 +134,10 @@ define <4 x i8> @foo3_4(<4 x float> %src) { ; ; CHECK-WIDE-LABEL: foo3_4: ; 
CHECK-WIDE: ## BB#0: -; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %eax -; CHECK-WIDE-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax +; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax +; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx +; CHECK-WIDE-NEXT: vmovd %ecx, %xmm1 ; CHECK-WIDE-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 ; CHECK-WIDE-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; CHECK-WIDE-NEXT: vcvttss2si %xmm2, %eax diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll index cb710c8205b..47590cb8447 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1718,17 +1718,17 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSE2-NEXT: movzbl (%rsi), %ecx ; SSE2-NEXT: shll $8, %ecx ; SSE2-NEXT: orl %eax, %ecx -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movzwl %cx, %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pinsrw $0, %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,4,4,4] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7] -; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: PR31364: @@ -1737,8 +1737,8 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) ; SSSE3-NEXT: movzbl (%rsi), %ecx ; SSSE3-NEXT: shll $8, %ecx ; SSSE3-NEXT: orl %eax, %ecx -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $0, %ecx, %xmm0 +; SSSE3-NEXT: movzwl %cx, %eax +; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] ; SSSE3-NEXT: retq ; diff --git a/test/CodeGen/X86/widen_bitops-0.ll b/test/CodeGen/X86/widen_bitops-0.ll index f8316d0e1ea..132a2fd928f 100644 --- a/test/CodeGen/X86/widen_bitops-0.ll +++ b/test/CodeGen/X86/widen_bitops-0.ll @@ -131,10 +131,10 @@ define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind { define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X32-SSE-LABEL: and_v3i8_as_i24: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pand %xmm0, %xmm1 @@ -172,10 +172,10 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { define <3 
x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X32-SSE-LABEL: xor_v3i8_as_i24: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pxor %xmm0, %xmm1 @@ -213,10 +213,10 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind { ; X32-SSE-LABEL: or_v3i8_as_i24: ; X32-SSE: # BB#0: -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 +; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 -; X32-SSE-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm1 +; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm1 ; X32-SSE-NEXT: por %xmm0, %xmm1 diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll index f2e29337e6a..4ae19b8f5d2 100644 --- a/test/CodeGen/X86/widen_conv-3.ll +++ b/test/CodeGen/X86/widen_conv-3.ll @@ -65,7 +65,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X86-SSE2-NEXT: shll $8, %edx ; X86-SSE2-NEXT: movzbl (%esp), %esi ; X86-SSE2-NEXT: orl %edx, %esi -; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0 +; X86-SSE2-NEXT: movd %esi, %xmm0 ; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx ; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0 ; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -115,7 +115,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE2-NEXT: shll $8, %eax ; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; X64-SSE2-NEXT: orl %eax, %ecx -; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0 +; X64-SSE2-NEXT: movd %ecx, %xmm0 ; X64-SSE2-NEXT: movzbl 2(%rsi), %eax ; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0 ; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll index 90c4bbe6bb7..e574407f980 100644 --- a/test/CodeGen/X86/widen_conv-4.ll +++ b/test/CodeGen/X86/widen_conv-4.ll @@ -91,7 +91,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X86-SSE2-NEXT: shll $8, %edx ; X86-SSE2-NEXT: movzbl (%esp), %esi ; X86-SSE2-NEXT: orl %edx, %esi -; X86-SSE2-NEXT: pinsrw $0, %esi, %xmm0 +; X86-SSE2-NEXT: movd %esi, %xmm0 ; X86-SSE2-NEXT: movzbl 2(%ecx), %ecx ; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -140,7 +140,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) ; X64-SSE2-NEXT: shll $8, %eax ; X64-SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; X64-SSE2-NEXT: orl %eax, %ecx -; X64-SSE2-NEXT: pinsrw $0, %ecx, %xmm0 +; X64-SSE2-NEXT: movd %ecx, %xmm0 ; X64-SSE2-NEXT: movzbl 2(%rsi), %eax ; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0 ; X64-SSE2-NEXT: pxor %xmm1, %xmm1
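
As an illustration of the intended effect, here is a hedged sketch (the IR
below is not one of this commit's tests, and the function name and exact
register assignments are illustrative assumptions): a build vector with no
zero elements whose first defined element is index 0 should now start with a
dependency-breaking (V)MOVD rather than a PINSRB into an undefined register.

define <16 x i8> @buildvec_sketch(i8 %a0, i8 %a1) {
  ; element 0 is defined and there are no zero elements, so the new lowering
  ; uses VZEXT_MOVL(SCALAR_TO_VECTOR(i32 AEXT(%a0))), i.e. a plain MOVD
  %v0 = insertelement <16 x i8> undef, i8 %a0, i32 0
  %v1 = insertelement <16 x i8> %v0, i8 %a1, i32 2
  ret <16 x i8> %v1
}

; Expected SSE4.1 codegen (sketch):
;   before:  pinsrb $0, %edi, %xmm0    reads the stale contents of %xmm0
;   after:   movd %edi, %xmm0          starts a fresh dependency chain
;            pinsrb $2, %esi, %xmm0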