From: Simon Pilgrim
Date: Fri, 3 Feb 2017 17:59:58 +0000 (+0000)
Subject: [X86][SSE] Add support for combining scalar_to_vector(extract_vector_elt) into a...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=da9f5183608bc474e9365517604fa71643c3f7dd;p=llvm

[X86][SSE] Add support for combining scalar_to_vector(extract_vector_elt) into a target shuffle.

Correctly flagging upper elements as undef.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294020 91177308-0d34-0410-b5e6-96231b3b80d8
---
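A minimal IR sketch of the kind of pattern this targets (illustrative only, not one of the patch's tests; the function name is made up): inserting an extracted lane into lane 0 of an otherwise-undef vector forms a scalar_to_vector(extract_vector_elt) node pair in the DAG, which getFauxShuffleMask can now decode as a single shuffle of the source vector whose remaining lanes are undef.

define <4 x float> @scalar_to_vector_extract(<4 x float> %v) {
  ; extract lane 2, then make it lane 0 of a fresh vector
  %e = extractelement <4 x float> %v, i32 2
  %r = insertelement <4 x float> undef, float %e, i32 0
  ret <4 x float> %r
}

For this sketch the faux shuffle mask would be <2, undef, undef, undef>: Mask.push_back records the extract index and Mask.append fills the remaining NumElts - 1 lanes with SM_SentinelUndef, as in the X86ISelLowering.cpp hunk below.
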
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a7a141050d9..f4b81063bcc 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5729,6 +5729,20 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     Ops.push_back(IsAndN ? N1 : N0);
     return true;
   }
+  case ISD::SCALAR_TO_VECTOR: {
+    // Match against a scalar_to_vector of an extract from a similar vector.
+    SDValue N0 = N.getOperand(0);
+    if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        N0.getOperand(0).getValueType() != VT ||
+        !isa<ConstantSDNode>(N0.getOperand(1)) ||
+        NumElts <= N0.getConstantOperandVal(1) ||
+        !N->isOnlyUserOf(N0.getNode()))
+      return false;
+    Ops.push_back(N0.getOperand(0));
+    Mask.push_back(N0.getConstantOperandVal(1));
+    Mask.append(NumElts - 1, SM_SentinelUndef);
+    return true;
+  }
   case X86ISD::PINSRB:
   case X86ISD::PINSRW: {
     SDValue InVec = N.getOperand(0);
diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll
index 807ac4e3fc6..43e09bfe5fe 100644
--- a/test/CodeGen/X86/combine-fcopysign.ll
+++ b/test/CodeGen/X86/combine-fcopysign.ll
@@ -292,7 +292,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; SSE-NEXT: cvtsd2ss %xmm1, %xmm1
 ; SSE-NEXT: andps %xmm4, %xmm1
 ; SSE-NEXT: orps %xmm6, %xmm1
-; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm3, %xmm1
 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: andps %xmm5, %xmm1
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 8bf704835ae..de2d5d79102 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -24,11 +24,11 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
 ; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
 ; CHECK-NEXT: vmovshdup {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm7[0,1],xmm2[1],xmm7[3]
 ; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm3[3]
 ; CHECK-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm3[3]
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index eef7d5a1366..241c63c6acd 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -342,23 +342,18 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ret void
 }

-; FIXME: Failed to fold to vpermil2ps
 define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr) {
 ; X32-LABEL: buildvector_v4f32_07z6:
 ; X32: # BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],zero,xmm0[2]
+; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
 ; X32-NEXT: vmovaps %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: buildvector_v4f32_07z6:
 ; X64: # BB#0:
-; X64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; X64-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm1[2]
+; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[3],zero,xmm1[2]
 ; X64-NEXT: vmovaps %xmm0, (%rdi)
 ; X64-NEXT: retq
 %b2 = extractelement <4 x float> %b, i32 2