[AMDGPU] move PHI nodes to AGPR class

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index b3a76aa4046b37391acca1e07e2e05b6eaaffa63..65286751c12dff97bfadb1cb9c874d4eb902f2ed 100644 (file)
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -757,6 +757,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
  
  void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
    unsigned numVGPRUses = 0;
+  bool AllAGPRUses = true;
    SetVector<const MachineInstr *> worklist;
    SmallSet<const MachineInstr *, 4> Visited;
    worklist.insert(&MI);
@@ -766,6 +767,9 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
      unsigned Reg = Instr->getOperand(0).getReg();
      for (const auto &Use : MRI->use_operands(Reg)) {
        const MachineInstr *UseMI = Use.getParent();
+      AllAGPRUses &= (UseMI->isCopy() &&
+                      TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
+                     TRI->isAGPR(*MRI, Use.getReg());
        if (UseMI->isCopy() || UseMI->isRegSequence()) {
          if (UseMI->isCopy() &&
            UseMI->getOperand(0).getReg().isPhysical() &&
@@ -794,11 +798,19 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
        }
      }
    }
+
+  Register PHIRes = MI.getOperand(0).getReg();
+  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+  if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
+    LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
+    MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+  }
+
    bool hasVGPRInput = false;
    for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
      unsigned InputReg = MI.getOperand(i).getReg();
      MachineInstr *Def = MRI->getVRegDef(InputReg);
-    if (TRI->isVGPR(*MRI, InputReg)) {
+    if (TRI->isVectorRegister(*MRI, InputReg)) {
        if (Def->isCopy()) {
          unsigned SrcReg = Def->getOperand(1).getReg();
          const TargetRegisterClass *RC =
@@ -810,15 +822,14 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
        break;
      }
      else if (Def->isCopy() &&
-      TRI->isVGPR(*MRI, Def->getOperand(1).getReg())) {
+      TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
        hasVGPRInput = true;
        break;
      }
    }
-  unsigned PHIRes = MI.getOperand(0).getReg();
-  const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
  
-  if ((!TRI->isVGPR(*MRI, PHIRes) && RC0 != &AMDGPU::VReg_1RegClass) &&
+  if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
+       RC0 != &AMDGPU::VReg_1RegClass) &&
      (hasVGPRInput || numVGPRUses > 1)) {
      LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
      TII->moveToVALU(MI);
diff --git a/test/CodeGen/AMDGPU/mfma-loop.ll b/test/CodeGen/AMDGPU/mfma-loop.ll

new file mode 100644 (file)

index 0000000..02f7c9b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
+; GCN-COUNT-32: v_accvgpr_write_b32
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+; GCN-COUNT-32: v_accvgpr_read_b32
+define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Fri, 18 Oct 2019 22:48:45 +0000 (22:48 +0000)
lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/mfma-loop.ll	[new file with mode: 0644]	patch \| blob