From: Simon Pilgrim
Date: Mon, 29 Apr 2019 19:52:59 +0000 (+0000)
Subject: [X86][SSE] isHorizontalBinOp - add support for target shuffles
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=74390f6fa6749d8c753d2636ceb9f21057dc2e96;p=llvm

[X86][SSE] isHorizontalBinOp - add support for target shuffles

Add target shuffle decoding to isHorizontalBinOp as well as ISD::VECTOR_SHUFFLE support.

This does mean we can now look through bitcasts, so we need to bitcast the extracted operands to ensure they are the correct type.

Fixes PR39936 and should help with PR39920/PR39921.

Differential Revision: https://reviews.llvm.org/D61245

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@359491 91177308-0d34-0410-b5e6-96231b3b80d8
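
For reference, a minimal sketch of the decode step this patch introduces. It assumes it sits inside lib/Target/X86/X86ISelLowering.cpp, where the existing peekThroughBitcasts, isTargetShuffle and getTargetShuffleMask helpers are visible; the patch itself implements this as the GetShuffle lambda inside isHorizontalBinOp, and the free-standing function name below is illustrative only.

// Minimal sketch, not the in-tree code: the patch uses an equivalent
// GetShuffle lambda inside isHorizontalBinOp; decodeShuffleForHorizOp is a
// hypothetical name used here for illustration.
static void decodeShuffleForHorizOp(SDValue Op, unsigned NumElts, SDValue &N0,
                                    SDValue &N1,
                                    SmallVectorImpl<int> &ShuffleMask) {
  // Generic shuffle: take its operands and mask directly.
  if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
    if (!Op.getOperand(0).isUndef())
      N0 = Op.getOperand(0);
    if (!Op.getOperand(1).isUndef())
      N1 = Op.getOperand(1);
    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
    ShuffleMask.append(Mask.begin(), Mask.end());
    return;
  }
  // X86 target shuffle, possibly hidden behind bitcasts: decode its mask, but
  // only accept it if it has at most two sources and its element count still
  // matches the type being matched.
  bool IsUnary;
  SmallVector<SDValue, 2> SrcOps;
  SmallVector<int, 16> SrcMask;
  SDValue BC = peekThroughBitcasts(Op);
  if (isTargetShuffle(BC.getOpcode()) &&
      getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(),
                           /*AllowSentinelZero*/ false, SrcOps, SrcMask,
                           IsUnary) &&
      SrcOps.size() <= 2 && SrcMask.size() == NumElts) {
    N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
    N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
    ShuffleMask.append(SrcMask.begin(), SrcMask.end());
  }
  // Otherwise ShuffleMask stays empty and the caller falls back to treating
  // Op as a non-shuffle operand with an identity mask.
}

The caller only needs to know whether a mask was extracted at all: if both LMask and RMask come back empty, neither operand was recognised as a shuffle and the combine gives up, which replaces the old up-front ISD::VECTOR_SHUFFLE opcode check.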
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c120ec1079f..4c708dbb92d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39248,51 +39248,65 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
   // which is A horizontal-op B.
 
-  // At least one of the operands should be a vector shuffle.
-  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
-      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
-    return false;
-
   MVT VT = LHS.getSimpleValueType();
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
+  unsigned NumElts = VT.getVectorNumElements();
+
+  auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+                        SmallVectorImpl<int> &ShuffleMask) {
+    if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+      if (!Op.getOperand(0).isUndef())
+        N0 = Op.getOperand(0);
+      if (!Op.getOperand(1).isUndef())
+        N1 = Op.getOperand(1);
+      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+      ShuffleMask.append(Mask.begin(), Mask.end());
+      return;
+    }
+    bool IsUnary;
+    SmallVector<SDValue, 2> SrcOps;
+    SmallVector<int, 16> SrcShuffleMask;
+    SDValue BC = peekThroughBitcasts(Op);
+    if (isTargetShuffle(BC.getOpcode()) &&
+        getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+                             SrcOps, SrcShuffleMask, IsUnary) &&
+        SrcOps.size() <= 2 && SrcShuffleMask.size() == NumElts) {
+      N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+      N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+      ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+    }
+  };
 
   // View LHS in the form
   //   LHS = VECTOR_SHUFFLE A, B, LMask
   // If LHS is not a shuffle, then pretend it is the identity shuffle:
   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
   // NOTE: A default initialized SDValue represents an UNDEF of type VT.
-  unsigned NumElts = VT.getVectorNumElements();
   SDValue A, B;
-  SmallVector<int, 16> LMask(NumElts);
-  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    if (!LHS.getOperand(0).isUndef())
-      A = LHS.getOperand(0);
-    if (!LHS.getOperand(1).isUndef())
-      B = LHS.getOperand(1);
-    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS)->getMask();
-    llvm::copy(Mask, LMask.begin());
-  } else {
-    A = LHS;
-    for (unsigned i = 0; i != NumElts; ++i)
-      LMask[i] = i;
-  }
+  SmallVector<int, 16> LMask;
+  GetShuffle(LHS, A, B, LMask);
 
   // Likewise, view RHS in the form
   //   RHS = VECTOR_SHUFFLE C, D, RMask
   SDValue C, D;
-  SmallVector<int, 16> RMask(NumElts);
-  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
-    if (!RHS.getOperand(0).isUndef())
-      C = RHS.getOperand(0);
-    if (!RHS.getOperand(1).isUndef())
-      D = RHS.getOperand(1);
-    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
-    llvm::copy(Mask, RMask.begin());
-  } else {
+  SmallVector<int, 16> RMask;
+  GetShuffle(RHS, C, D, RMask);
+
+  // At least one of the operands should be a vector shuffle.
+  if (LMask.empty() && RMask.empty())
+    return false;
+
+  if (LMask.empty()) {
+    A = LHS;
+    for (unsigned i = 0; i != NumElts; ++i)
+      LMask.push_back(i);
+  }
+
+  if (RMask.empty()) {
     C = RHS;
     for (unsigned i = 0; i != NumElts; ++i)
-      RMask[i] = i;
+      RMask.push_back(i);
   }
 
   // If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -39359,7 +39373,8 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
       isHorizontalBinOp(LHS, RHS, IsFadd) &&
       shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
-    return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+    return DAG.getNode(HorizOpcode, SDLoc(N), VT, DAG.getBitcast(VT, LHS),
+                       DAG.getBitcast(VT, RHS));
 
   return SDValue();
 }
@@ -42261,6 +42276,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
     };
+    Op0 = DAG.getBitcast(VT, Op0);
+    Op1 = DAG.getBitcast(VT, Op1);
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                             HADDBuilder);
   }
@@ -42392,6 +42409,8 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          ArrayRef<SDValue> Ops) {
       return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
     };
+    Op0 = DAG.getBitcast(VT, Op0);
+    Op1 = DAG.getBitcast(VT, Op1);
     return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
                             HSUBBuilder);
   }
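
Because the decoded shuffle sources can sit behind a bitcast, they may no longer carry the value type of the node being combined, which is why each caller above re-casts the operands before building the horizontal op. A minimal sketch of that caller-side pattern, using a hypothetical helper name (the patch performs the casts inline in combineFaddFsub/combineAdd/combineSub rather than through a function; DAG.getBitcast is a no-op when the type already matches):

// Illustrative only, assuming the usual SelectionDAG types are in scope.
static SDValue buildHorizOp(SelectionDAG &DAG, const SDLoc &DL,
                            unsigned HorizOpcode, MVT VT, SDValue LHS,
                            SDValue RHS) {
  // The operands returned by isHorizontalBinOp may still carry the
  // pre-bitcast type of the decoded target shuffle, so normalize them to VT.
  LHS = DAG.getBitcast(VT, LHS);
  RHS = DAG.getBitcast(VT, RHS);
  return DAG.getNode(HorizOpcode, DL, VT, LHS, RHS);
}

The same two-line normalization appears above in the HADD/HSUB paths before SplitOpsAndApply and inline in combineFaddFsub.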
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index c685d1ec942..e9a74a32558 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -1632,9 +1632,7 @@ define float @PR39936_v8f32(<8 x float>) {
 ; AVX-SLOW-LABEL: PR39936_v8f32:
 ; AVX-SLOW:       # %bb.0:
 ; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; AVX-SLOW-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; AVX-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -1646,9 +1644,7 @@ define float @PR39936_v8f32(<8 x float>) {
 ; AVX-FAST-LABEL: PR39936_v8f32:
 ; AVX-FAST:       # %bb.0:
 ; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; AVX-FAST-NEXT:    vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
 ; AVX-FAST-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 697c8d71c0f..356227c6c3f 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -803,32 +803,51 @@ define i32 @PR39936_v8i32(<8 x i32>) {
 ; SSSE3-FAST-NEXT:    movd %xmm0, %eax
 ; SSSE3-FAST-NEXT:    retq
 ;
-; AVX-SLOW-LABEL: PR39936_v8i32:
-; AVX-SLOW:       # %bb.0:
-; AVX-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX-SLOW-NEXT:    vzeroupper
-; AVX-SLOW-NEXT:    retq
+; AVX1-SLOW-LABEL: PR39936_v8i32:
+; AVX1-SLOW:       # %bb.0:
+; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT:    vzeroupper
+; AVX1-SLOW-NEXT:    retq
 ;
-; AVX-FAST-LABEL: PR39936_v8i32:
-; AVX-FAST:       # %bb.0:
-; AVX-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX-FAST-NEXT:    vzeroupper
-; AVX-FAST-NEXT:    retq
+; AVX1-FAST-LABEL: PR39936_v8i32:
+; AVX1-FAST:       # %bb.0:
+; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX1-FAST-NEXT:    vzeroupper
+; AVX1-FAST-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: PR39936_v8i32:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-SLOW-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT:    vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: PR39936_v8i32:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vmovd %xmm0, %eax
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
   %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32>
   %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32>
   %4 = add <8 x i32> %2, %3
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 4fe07cae0dd..ff3edc3bc3b 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1732,9 +1732,7 @@ define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_0246_1357:
@@ -1775,9 +1773,7 @@ define <4 x double> @add_v4f64_4602_5713(<4 x double> %a, <4 x double> %b) {
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vhaddpd %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v4f64_4602_5713:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 688ddce51f8..67547f20dba 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2805,9 +2805,7 @@ define <8 x float> @add_v8f32_02468ACE_13579BDF(<8 x float> %a, <8 x float> %b)
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
@@ -2848,9 +2846,7 @@ define <8 x float> @add_v8f32_8ACE0246_9BDF1357(<8 x float> %a, <8 x float> %b)
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vhaddps %ymm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: