[AMDGPU] Fix scalar operand folding bug that causes SHOC performance regression.

author Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)

committer Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)
author Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)
committer Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp

index bd0bc734060c8e3573b1c99a008a20ce57d4b6c9..f4e8669583699686c0e219f4da313e341c2c4c98 100644 (file)
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -854,13 +854,17 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
      }
    } else {
      // Folding register.
+    SmallVector <MachineRegisterInfo::use_iterator, 4> UsesToProcess;
      for (MachineRegisterInfo::use_iterator
             Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
           Use != E; ++Use) {
-      MachineInstr *UseMI = Use->getParent();
+      UsesToProcess.push_back(Use);
+    }
+    for (auto U : UsesToProcess) {
+      MachineInstr *UseMI = U->getParent();
  
-      foldOperand(OpToFold, UseMI, Use.getOperandNo(),
-                  FoldList, CopiesToReplace);
+      foldOperand(OpToFold, UseMI, U.getOperandNo(),
+        FoldList, CopiesToReplace);
      }
    }
  
diff --git a/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll

index 3746aa8784bff03d33bc988f6111c79fe9ea35c1..a3f176b3ef025301f69d9537cdfd741af0f5193c 100644 (file)
--- a/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -131,10 +131,10 @@ define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
  ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
  ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_rcp_f32_e32
  ; GCN-DENORM-DAG: v_rcp_f32_e32
  ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -166,10 +166,10 @@ define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %
  ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
  ; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
  ; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_rcp_f32_e32
  ; GCN-DENORM-DAG: v_rcp_f32_e32
  ; GCN-DENORM-DAG: v_rcp_f32_e32
@@ -246,7 +246,7 @@ define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace
  ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
  
  ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
  ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
@@ -288,7 +288,7 @@ define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  ; GCN-DAG:        v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
  
  ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
  ; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
  ; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
author	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 3 Jan 2019 19:55:32 +0000 (19:55 +0000)
lib/Target/AMDGPU/SIFoldOperands.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll		patch \| blob \| history