From: Sanjay Patel
Date: Tue, 22 Jan 2019 14:24:13 +0000 (+0000)
Subject: [DAGCombiner] narrow vector binop with 2 insert subvector operands
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e5439d54578312472c0b44573ec292c1d8a8aa5a;p=llvm

[DAGCombiner] narrow vector binop with 2 insert subvector operands

vecbo (insertsubv undef, X, Z), (insertsubv undef, Y, Z) --> insertsubv VecC, (vecbo X, Y), Z

This is another step in generic vector narrowing. It's also a step towards
more horizontal op formation, specifically for x86 (although we still fail
to match those in the affected tests).

The scalarization cases are also not optimal (ideally, we would scalarize
those), but it's still an improvement to use a narrower vector op when we
know part of the result must be constant because both inputs are undef in
some vector lanes.

I think a similar match that checks for a constant operand might help some
of the cases in D51553.
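For illustration, here is a minimal IR sketch of the kind of pattern this
targets (modeled on the xor_insert_insert test in
test/CodeGen/X86/vector-partial-undef.ll that is updated below; the shuffle
masks are assumed for the example):

  define <4 x i64> @xor_insert_insert(<2 x i64> %x, <2 x i64> %y) {
    ; Both operands are <2 x i64> values widened to <4 x i64> with undef
    ; upper elements, roughly (insert_subvector undef, X, 0) in the DAG.
    %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    ; With the new fold, the xor can be performed as a single 128-bit op.
    %r = xor <4 x i64> %xw, %yw
    ret <4 x i64> %r
  }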
Differential Revision: https://reviews.llvm.org/D56875

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351825 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3437f0debae..b440bbe29fd 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18174,6 +18174,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   SDValue Ops[] = {LHS, RHS};
+  EVT VT = N->getValueType(0);
 
   // See if we can constant fold the vector operation.
   if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -18191,7 +18192,6 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
     ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
 
     if (SVN0->getMask().equals(SVN1->getMask())) {
-      EVT VT = N->getValueType(0);
       SDValue UndefVector = LHS.getOperand(1);
       SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                      LHS.getOperand(0), RHS.getOperand(0),
@@ -18202,6 +18202,29 @@
     }
   }
 
+  // The following pattern is likely to emerge with vector reduction ops. Moving
+  // the binary operation ahead of insertion may allow using a narrower vector
+  // instruction that has better performance than the wide version of the op:
+  // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
+  if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
+      RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
+      LHS.getOperand(2) == RHS.getOperand(2) &&
+      (LHS.hasOneUse() || RHS.hasOneUse())) {
+    SDValue X = LHS.getOperand(1);
+    SDValue Y = RHS.getOperand(1);
+    SDValue Z = LHS.getOperand(2);
+    EVT NarrowVT = X.getValueType();
+    if (NarrowVT == Y.getValueType() &&
+        TLI.isOperationLegalOrCustomOrPromote(N->getOpcode(), NarrowVT)) {
+      // (binop undef, undef) may not return undef, so compute that result.
+      SDLoc DL(N);
+      SDValue VecC = DAG.getNode(N->getOpcode(), DL, VT, DAG.getUNDEF(VT),
+                                 DAG.getUNDEF(VT));
+      SDValue NarrowBO = DAG.getNode(N->getOpcode(), DL, NarrowVT, X, Y);
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
+    }
+  }
+
   return SDValue();
 }
 
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll
index 00063521c6d..0979657e0f6 100644
--- a/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -111,14 +111,14 @@ define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: hadd_16_3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32>
@@ -134,14 +134,14 @@ define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; KNL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; KNL-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vaddps %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhadd_16_3:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; SKX-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SKX-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vaddps %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32>
@@ -156,14 +156,14 @@ define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; KNL-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; KNL-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: fhadd_16_4:
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; SKX-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
 ; SKX-NEXT:    retq
   %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
   %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32>
diff --git a/test/CodeGen/X86/scalarize-fp.ll b/test/CodeGen/X86/scalarize-fp.ll
index d9606b87352..5cf526dbff5 100644
--- a/test/CodeGen/X86/scalarize-fp.ll
+++ b/test/CodeGen/X86/scalarize-fp.ll
@@ -198,9 +198,8 @@ define <4 x double> @fadd_op1_constant_v4f64(double %x) nounwind {
 ;
 ; AVX-LABEL: fadd_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fadd <4 x double> %v,
@@ -219,7 +218,7 @@ define <4 x double> @load_fadd_op1_constant_v4f64(double* %p) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -237,9 +236,8 @@ define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fsub <4 x double> , %v
@@ -258,7 +256,7 @@ define <4 x double> @load_fsub_op0_constant_v4f64(double* %p) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vsubpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -275,9 +273,8 @@ define <4 x double> @fmul_op1_constant_v4f64(double %x) nounwind {
 ;
 ; AVX-LABEL: fmul_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fmul <4 x double> %v,
@@ -296,7 +293,7 @@ define <4 x double> @load_fmul_op1_constant_v4f64(double* %p) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -313,9 +310,8 @@ define <4 x double> @fdiv_op1_constant_v4f64(double %x) nounwind {
 ;
 ; AVX-LABEL: fdiv_op1_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> %v,
@@ -334,7 +330,7 @@ define <4 x double> @load_fdiv_op1_constant_v4f64(double* %p) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
@@ -352,9 +348,8 @@ define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f64:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %v = insertelement <4 x double> undef, double %x, i32 0
   %b = fdiv <4 x double> , %v
@@ -373,7 +368,7 @@ define <4 x double> @load_fdiv_op0_constant_v4f64(double* %p) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vdivpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %x = load double, double* %p
   %v = insertelement <4 x double> undef, double %x, i32 0
diff --git a/test/CodeGen/X86/vector-partial-undef.ll b/test/CodeGen/X86/vector-partial-undef.ll
index 1cd3415d082..2b4ab11fea5 100644
--- a/test/CodeGen/X86/vector-partial-undef.ll
+++ b/test/CodeGen/X86/vector-partial-undef.ll
@@ -13,9 +13,7 @@ define <4 x i64> @xor_insert_insert(<2 x i64> %x, <2 x i64> %y) {
 ;
 ; AVX-LABEL: xor_insert_insert:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32>
   %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32>
@@ -32,9 +30,9 @@ define <4 x i64> @xor_insert_insert_high_half(<2 x i64> %x, <2 x i64> %y) {
 ;
 ; AVX-LABEL: xor_insert_insert_high_half:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
   %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32>
   %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32>
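A note on the VecC operand, visible in the xor_insert_insert_high_half diff
just above: as the new comment in DAGCombiner says, (binop undef, undef) may
not return undef, so the combine computes that value explicitly. For xor it
folds to a zero vector, which is why the narrowed code materializes zero
(vxorps %xmm1, %xmm1, %xmm1) and then inserts the 128-bit xor result into its
high half. A minimal IR sketch of that input pattern (the shuffle masks here
are assumed for illustration, not copied from the test file):

  define <4 x i64> @xor_insert_insert_high_half(<2 x i64> %x, <2 x i64> %y) {
    ; Each operand is widened into the high half of an otherwise-undef
    ; <4 x i64>, roughly (insert_subvector undef, X, 2) in the DAG, then
    ; the xor is applied on the wide type.
    %xw = shufflevector <2 x i64> %x, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    %yw = shufflevector <2 x i64> %y, <2 x i64> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
    %r = xor <4 x i64> %xw, %yw
    ret <4 x i64> %r
  }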