From: Simon Pilgrim
Date: Sun, 5 Feb 2017 22:50:29 +0000 (+0000)
Subject: [X86][SSE] Replace insert_vector_elt(vec, -1, idx) with shuffle
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0f1d1738924173de7681c9c7115ba45d98849c13;p=llvm

[X86][SSE] Replace insert_vector_elt(vec, -1, idx) with shuffle

Similar to what we already do for zero element insertion, we can quickly
rematerialize an 'allbits' vector, avoiding an unnecessary GPR value and its
insertion into the vector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294162 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1aa7c422012..af06cd3a719 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13844,17 +13844,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   auto *N2C = cast<ConstantSDNode>(N2);
   unsigned IdxVal = N2C->getZExtValue();
 
-  // If we are clearing out a element, we do this more efficiently with a
-  // blend shuffle than a costly integer insertion.
-  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+  bool IsZeroElt = X86::isZeroNode(N1);
+  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+  // If we are inserting a element, see if we can do this more efficiently with
+  // a blend shuffle with a rematerializable vector than a costly integer
+  // insertion.
   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
   // be beneficial if we are inserting several zeros and can combine the masks.
-  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
-    SmallVector<int, 8> ClearMask;
+  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+    SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
-      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
-    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
-    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+                                  : DAG.getConstant(-1, dl, VT);
+    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
   }
 
   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll
index 45a6421cf22..066719b3bfe 100644
--- a/test/CodeGen/X86/avx-cvt-3.ll
+++ b/test/CodeGen/X86/avx-cvt-3.ll
@@ -48,27 +48,17 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
 define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
 ; X86-LABEL: sitofp_insert_allbits_v8i32:
 ; X86:       # BB#0:
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm1
-; X86-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X86-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
-; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 ; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_insert_allbits_v8i32:
 ; X64:       # BB#0:
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm1
-; X64-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
-; X64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 ; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; X64-NEXT:    retq
   %1 = insertelement <8 x i32> %a0, i32 -1, i32 0
@@ -105,9 +95,9 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
 ; X86:       # BB#0:
 ; X86-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm1
-; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT:    movl $2, %eax
 ; X86-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
@@ -121,9 +111,9 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
 ; X64:       # BB#0:
 ; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; X64-NEXT:    movl $-1, %eax
-; X64-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT:    movl $2, %eax
 ; X64-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
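
For reference, the insertion pattern this patch improves is easy to reproduce outside of the
avx-cvt-3.ll changes above. The snippet below is an illustrative sketch, not part of the commit:
the function name and the v4i32 width are invented, and the expected codegen (rematerializing
all-ones with (v)pcmpeqd and blending it in, instead of movl $-1 plus (v)pinsrd) is inferred from
the test updates above. It can be fed to llc with any SSE4.1-capable target to inspect the lowering.

; Illustrative only: insert the all-ones constant into lane 2 of a v4i32.
; With this change the -1 element should be rematerialized as an all-ones
; vector and blended into %a0, avoiding the GPR round trip.
define <4 x i32> @insert_allbits_v4i32(<4 x i32> %a0) {
  %1 = insertelement <4 x i32> %a0, i32 -1, i32 2
  ret <4 x i32> %1
}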