From 9c26d11c10e30a15b4a89f0b36d612843b708c1a Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 27 Aug 2019 15:17:46 +0000 Subject: [PATCH] [DAGCombiner] cancel fnegs from multiplied operands of FMA (-X) * (-Y) + Z --> X * Y + Z This is a missing optimization that shows up as a potential regression in D66050, so we should solve it first. We appear to be partly missing this fold in IR as well. We do handle the simpler case already: (-X) * (-Y) --> X * Y And it might be beneficial to make the constraint less conservative (e.g., if both operands are cheap, but not necessarily cheaper), but that causes infinite looping for the existing fmul transform. Differential Revision: https://reviews.llvm.org/D66755 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370071 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 44 ++++++++++++++++-------- test/CodeGen/AMDGPU/fneg-combines.ll | 2 +- test/CodeGen/PowerPC/fneg.ll | 7 ++-- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index eea22e220f1..7187466b055 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -516,6 +516,7 @@ namespace { bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const; bool isOneUseSetCC(SDValue N) const; + bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y); SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); @@ -12110,6 +12111,22 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { return SDValue(); } +/// Return true if both inputs are at least as cheap in negated form and at +/// least one input is strictly cheaper in negated form. 
+bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) { + const TargetOptions &Options = DAG.getTarget().Options; + if (char LHSNeg = isNegatibleForFree(X, LegalOperations, TLI, &Options, + ForCodeSize)) + if (char RHSNeg = isNegatibleForFree(Y, LegalOperations, TLI, &Options, + ForCodeSize)) + // Both negated operands are at least as cheap as their counterparts. + // Check to see if at least one is cheaper negated. + if (LHSNeg == 2 || RHSNeg == 2) + return true; + + return false; +} + SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12180,21 +12197,11 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N0); - // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) - if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options, - ForCodeSize)) { - if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options, - ForCodeSize)) { - // Both can be negated for free, check to see if at least one is cheaper - // negated. 
- if (LHSNeg == 2 || RHSNeg == 2) - return DAG.getNode(ISD::FMUL, DL, VT, - GetNegatedExpression(N0, DAG, LegalOperations, - ForCodeSize), - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), - Flags); - } + // -N0 * -N1 --> N0 * N1 + if (isCheaperToUseNegatedFPOps(N0, N1)) { + SDValue NegN0 = GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags); } // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) @@ -12273,6 +12280,13 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } + // (-N0 * -N1) + N2 --> (N0 * N1) + N2 + if (isCheaperToUseNegatedFPOps(N0, N1)) { + SDValue NegN0 = GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags); + } + if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 867ab278541..458018fbf4f 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1205,7 +1205,7 @@ define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, flo ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]] -; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], -[[B]], [[C]] +; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]] ; GCN-SAFE: v_xor_b32_e32 v{{[[0-9]+}}, 0x80000000, [[FMA]] ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]] diff --git a/test/CodeGen/PowerPC/fneg.ll b/test/CodeGen/PowerPC/fneg.ll index edcfe2d5c86..328ffecd176 100644 --- a/test/CodeGen/PowerPC/fneg.ll +++ b/test/CodeGen/PowerPC/fneg.ll @@ -20,8 +20,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #4 define float 
@fma_fneg_fneg(float %x, float %y, float %z) { ; CHECK-LABEL: fma_fneg_fneg: ; CHECK: # %bb.0: -; CHECK-NEXT: fneg f0, f2 -; CHECK-NEXT: fnmsubs f1, f1, f0, f3 +; CHECK-NEXT: fmadds f1, f1, f2, f3 ; CHECK-NEXT: blr %negx = fneg float %x %negy = fneg float %y @@ -32,8 +31,8 @@ define float @fma_fneg_fneg(float %x, float %y, float %z) { define float @fma_fneg_fsub(float %x, float %y0, float %y1, float %z) { ; CHECK-LABEL: fma_fneg_fsub: ; CHECK: # %bb.0: -; CHECK-NEXT: fsubs f0, f2, f3 -; CHECK-NEXT: fnmsubs f1, f1, f0, f4 +; CHECK-NEXT: fsubs f0, f3, f2 +; CHECK-NEXT: fmadds f1, f1, f0, f4 ; CHECK-NEXT: blr %negx = fneg float %x %negy = fsub nsz float %y0, %y1 -- 2.40.0