From bf5049eaf138dfe5c07fbed93c434fd603c9faee Mon Sep 17 00:00:00 2001 From: Alexander Timofeev Date: Thu, 3 Jan 2019 19:55:32 +0000 Subject: [PATCH] [AMDGPU] Fix scalar operand folding bug that causes SHOC performance regression. Detailed description: SIFoldOperands::foldInstOperand iterates over the operand uses calling the function that changes def-use iteratorson the way. As a result loop exits immediately when def-use iterator is changed. Hence, the operand is folded to the very first use instruction only. This makes VGPR live along the whole basic block and increases register pressure significantly. The performance drop observed in SHOC DeviceMemory test is caused by this bug. Proposed fix: collect uses to separate container for further processing in another loop. Testing: make check-llvm SHOC performance test. Reviewers: rampitec, ronlieb Differential Revision: https://reviews.llvm.org/D56161 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350350 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIFoldOperands.cpp | 10 +++++++--- test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 20 ++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index bd0bc734060..f4e86695836 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -854,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, } } else { // Folding register. + SmallVector UsesToProcess; for (MachineRegisterInfo::use_iterator Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end(); Use != E; ++Use) { - MachineInstr *UseMI = Use->getParent(); + UsesToProcess.push_back(Use); + } + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); - foldOperand(OpToFold, UseMI, Use.getOperandNo(), - FoldList, CopiesToReplace); + foldOperand(OpToFold, UseMI, U.getOperandNo(), + FoldList, CopiesToReplace); } } diff --git a/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll index 3746aa8784b..a3f176b3ef0 100644 --- a/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ b/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -131,10 +131,10 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) { ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 @@ -166,10 +166,10 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* % ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]] ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 @@ -246,7 +246,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}} ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]] ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}} @@ -288,7 +288,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) { ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} +; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}} ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}} ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]] ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}} -- 2.50.1