From aa108d865d3062361edbe9c662af43accd8456d8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 27 Oct 2017 09:06:07 +0000 Subject: [PATCH] DAG: Fold fma (fneg x), K, y -> fma x, -K, y git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316753 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 +++++ test/CodeGen/AMDGPU/fma-combine.ll | 46 ++++++++++++++++++++++++ test/CodeGen/AMDGPU/fmuladd.f16.ll | 4 +-- test/CodeGen/AMDGPU/fmuladd.f32.ll | 4 +-- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b79ff7f146d..a64a79383d0 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10123,6 +10123,14 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } + + // fma (fneg x), K, y -> fma x, -K, y + if (N0.getOpcode() == ISD::FNEG && + (TLI.isOperationLegal(ISD::ConstantFP, VT) || + (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) { + return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), + DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2); + } } if (Options.UnsafeFPMath) { diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll index 7526d08bdbe..132f0538d63 100644 --- a/test/CodeGen/AMDGPU/fma-combine.ll +++ b/test/CodeGen/AMDGPU/fma-combine.ll @@ -625,5 +625,51 @@ define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out, ret void } +; Make sure negative constant cancels out fneg +; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN-NOT: [[A]] +; GCN-NOT: [[B]] +; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]] +define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* 
%in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32: +; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]] +; GCN-NOT: [[A]] +; GCN-NOT: [[B]] +; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]] +define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid + + %r1 = load volatile float, float addrspace(1)* %gep.0 + %r2 = load volatile float, float addrspace(1)* %gep.1 + + %r1.fneg = fsub float -0.000000e+00, %r1 + + %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2) + store float %r3, float addrspace(1)* %gep.out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } + diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll index 980d68ceded..5f9e9650be6 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -154,7 +154,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half ; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] 
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -178,7 +178,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll index 4b1e41ff91e..fee3c95c473 100644 --- a/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -218,7 +218,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, flo ; SI-FLUSH: buffer_store_dword [[R2]] ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] @@ -249,7 +249,7 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, ; SI-FLUSH: buffer_store_dword [[R2]] ; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] +; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] ; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], 
[[R2]], [[TMP]] -- 2.50.1