From 9cb56c603f5e64105aa75e32f1d8400ebfcb0d46 Mon Sep 17 00:00:00 2001
From: David Stuttard
Date: Wed, 16 Oct 2019 14:37:39 +0000
Subject: [PATCH] [AMDGPU] Fix-up cases where writelane has 2 SGPR operands

Summary:
Even though writelane doesn't have the same constraints as other VALU
instructions, it still can't use more than one SGPR operand.

Due to later register propagation (e.g. fixing up VGPR operands via
readfirstlane), changing writelane to only have a single SGPR is tricky.

This implementation adds a new check to SIFixSGPRCopies that prevents
multiple SGPRs from being used in any writelane instruction.

The algorithm used is to check for trivial copy propagation of suitable
constants into one of the SGPR operands and to perform that if possible.
If this isn't possible, put an explicit copy of the Src1 SGPR into M0 and
use that instead (this is allowed for writelane because the constraint is
on the SGPR read port, not on constant-bus access).

Reviewers: rampitec, tpr, arsenm, nhaehnle

Reviewed By: rampitec, arsenm, nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, mgorny, yaxunl, tpr, t-tye, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D51932

Change-Id: Ic7553fa57440f208d4dbc4794fc24345d7e0e9ea
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@375004 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIFixSGPRCopies.cpp        | 61 ++++++++++++++++++++
 lib/Target/AMDGPU/SIInstrInfo.cpp            | 26 +++++++++
 test/CodeGen/AMDGPU/inserted-wait-states.mir |  3 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 18 +++---
 4 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 196650759f9..b3a76aa4046 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -684,6 +684,67 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         }
         break;
       }
+      case AMDGPU::V_WRITELANE_B32: {
+        // Some architectures allow more than one constant bus access without
+        // the SGPR restriction.
+        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
+          break;
+
+        // Writelane is special in that it can use an SGPR and M0 (which would
+        // normally count as using the constant bus twice - but in this case it
+        // is allowed since the lane selector doesn't count as a use of the
+        // constant bus). However, it is still required to abide by the 1 SGPR
+        // rule. Apply a fix here as we might have multiple SGPRs after
+        // legalizing VGPRs to SGPRs.
+        int Src0Idx =
+            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+        int Src1Idx =
+            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+        MachineOperand &Src0 = MI.getOperand(Src0Idx);
+        MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+        // Check to see if the instruction violates the 1 SGPR rule.
+        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
+             Src0.getReg() != AMDGPU::M0) &&
+            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
+             Src1.getReg() != AMDGPU::M0)) {
+
+          // Check for a trivially easy constant prop into one of the operands.
+          // If this is the case, perform the operation now to resolve the SGPR
+          // issue. 
If we don't do that here, we will always insert a mov to m0
+          // that can't be resolved in a later operand folding pass.
+          bool Resolved = false;
+          for (MachineOperand *MO : {&Src0, &Src1}) {
+            if (Register::isVirtualRegister(MO->getReg())) {
+              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
+              if (DefMI && TII->isFoldableCopy(*DefMI)) {
+                const MachineOperand &Def = DefMI->getOperand(0);
+                if (Def.isReg() &&
+                    MO->getReg() == Def.getReg() &&
+                    MO->getSubReg() == Def.getSubReg()) {
+                  const MachineOperand &Copied = DefMI->getOperand(1);
+                  if (Copied.isImm() &&
+                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
+                    MO->ChangeToImmediate(Copied.getImm());
+                    Resolved = true;
+                    break;
+                  }
+                }
+              }
+            }
+          }
+
+          if (!Resolved) {
+            // Haven't managed to resolve by replacing an SGPR with an immediate.
+            // Move src1 into M0 instead.
+            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                    TII->get(AMDGPU::COPY), AMDGPU::M0)
+                .add(Src1);
+            Src1.ChangeToRegister(AMDGPU::M0, false);
+          }
+        }
+        break;
+      }
       }
     }
   }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 3bcee08267f..3b50f973b78 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3495,6 +3495,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }
 
+  // Special case for writelane - this can break the multiple constant bus rule,
+  // but it still can't use more than one SGPR register.
+  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+    unsigned SGPRCount = 0;
+    Register SGPRUsed = AMDGPU::NoRegister;
+
+    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+      if (OpIdx == -1)
+        break;
+
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+          if (MO.getReg() != SGPRUsed)
+            ++SGPRCount;
+          SGPRUsed = MO.getReg();
+        }
+      }
+      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+        ErrInfo = "WRITELANE instruction violates constant bus restriction";
+        return false;
+      }
+    }
+  }
+
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 1ab10fa92f7..12644fd0a49 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -317,8 +317,9 @@ body: |
     S_BRANCH %bb.3
 
   bb.3:
+    $m0 = S_MOV_B32 $sgpr4
    $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
+    $vgpr4 = V_WRITELANE_B32 $m0, $vcc_lo, $vgpr4
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 2d0ebe8edb7..f7f1fd5ea6e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -1,10 +1,12 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CI,CIGFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,CIGFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 ; CHECK-LABEL: {{^}}test_writelane_sreg: -; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0 +; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %oldval = load i32, i32 addrspace(1)* %out %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) @@ -35,11 +37,11 @@ define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x ret void } -; TODO: m0 should be folded. ; CHECK-LABEL: {{^}}test_writelane_m0_sreg: ; CHECK: s_mov_b32 m0, -1 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 -; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}} +; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0 +; GFX10: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}} define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %oldval = load i32, i32 addrspace(1)* %out %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"() @@ -59,7 +61,8 @@ define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) ; CHECK-LABEL: {{^}}test_writelane_sreg_oldval: ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}} -; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} +; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 +; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval) store i32 %writelane, i32 addrspace(1)* %out, align 4 @@ -68,7 +71,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 add ; CHECK-LABEL: {{^}}test_writelane_imm_oldval: ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42 -; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} +; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0 +; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}} define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42) store i32 %writelane, i32 addrspace(1)* %out, align 4 -- 2.40.0
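
As a small illustration of the fix-up performed by the SIFixSGPRCopies change
above (the register numbers below are made up for the example and are not taken
from the tests in this patch), a writelane whose value and lane-select operands
are both SGPRs is rewritten, on subtargets with a constant bus limit of 1, so
that the lane select is read from m0 instead:

  ; Before the fix-up: two different SGPRs are read by one writelane,
  ; which violates the 1 SGPR rule.
  $vgpr0 = V_WRITELANE_B32 $sgpr0, $sgpr1, $vgpr0

  ; After the fix-up: the lane-select SGPR is first copied into m0 (the COPY
  ; is later lowered to s_mov_b32), and m0 does not count towards the SGPR
  ; limit, so the writelane itself now reads only one SGPR.
  $m0 = COPY $sgpr1
  $vgpr0 = V_WRITELANE_B32 $sgpr0, $m0, $vgpr0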