[AMDGPU] Fix-up cases where writelane has 2 SGPR operands

author David Stuttard <david.stuttard@amd.com>

Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)

committer David Stuttard <david.stuttard@amd.com>

Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)
author David Stuttard <david.stuttard@amd.com>
Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)
committer David Stuttard <david.stuttard@amd.com>
Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

index 196650759f97c992e77a45c80c95e7955d2b98f7..b3a76aa4046b37391acca1e07e2e05b6eaaffa63 100644 (file)
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -684,6 +684,67 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
          }
          break;
        }
+      case AMDGPU::V_WRITELANE_B32: {
+        // Some architectures allow more than one constant bus access without
+        // SGPR restriction, so no fix-up is needed there.
+        if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
+          break;
+
+        // Writelane is special in that it can use SGPR and M0 (which would
+        // normally count as using the constant bus twice - but in this case it
+        // is allowed since the lane selector doesn't count as a use of the
+        // constant bus). However, it is still required to abide by the 1 SGPR
+        // rule. Apply a fix here as we might have multiple SGPRs after
+        // legalizing VGPRs to SGPRs.
+        int Src0Idx =
+            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+        int Src1Idx =
+            AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+        MachineOperand &Src0 = MI.getOperand(Src0Idx);
+        MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+        // Check to see if the instruction violates the 1 SGPR rule
+        if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
+             Src0.getReg() != AMDGPU::M0) &&
+            (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
+             Src1.getReg() != AMDGPU::M0)) {
+
+          // Check for a trivially easy constant prop into one of the operands.
+          // If there is one, perform it now to resolve the SGPR issue. If we
+          // don't do that here we will always insert a mov to m0 that can't be
+          // resolved in a later operand folding pass.
+          bool Resolved = false;
+          for (MachineOperand *MO : {&Src0, &Src1}) {
+            if (Register::isVirtualRegister(MO->getReg())) {
+              MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
+              if (DefMI && TII->isFoldableCopy(*DefMI)) {
+                const MachineOperand &Def = DefMI->getOperand(0);
+                if (Def.isReg() &&
+                    MO->getReg() == Def.getReg() &&
+                    MO->getSubReg() == Def.getSubReg()) {
+                  const MachineOperand &Copied = DefMI->getOperand(1);
+                  if (Copied.isImm() &&
+                      TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
+                    MO->ChangeToImmediate(Copied.getImm());
+                    Resolved = true;
+                    break;
+                  }
+                }
+              }
+            }
+          }
+
+          if (!Resolved) {
+            // Couldn't resolve by replacing an SGPR with an immediate, so move
+            // src1 into M0 instead.
+            BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                    TII->get(AMDGPU::COPY), AMDGPU::M0)
+                .add(Src1);
+            Src1.ChangeToRegister(AMDGPU::M0, false);
+          }
+        }
+        break;
+      }
        }
      }
    }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp

index 3bcee08267fc40b695d6b6c45f6f0ac6eb4cb152..3b50f973b78a653b661ef8598f3c63be8fc521ae 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3495,6 +3495,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
      }
    }
  
+  // Special case for writelane - this can break the multiple constant bus rule,
+  // but still can't use more than one SGPR.
+  if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+    unsigned SGPRCount = 0;
+    Register SGPRUsed = AMDGPU::NoRegister;
+
+    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+      if (OpIdx == -1)
+        break;
+
+      const MachineOperand &MO = MI.getOperand(OpIdx);
+
+      if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+        if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+          if (MO.getReg() != SGPRUsed)
+            ++SGPRCount;
+          SGPRUsed = MO.getReg();
+        }
+      }
+      if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+        ErrInfo = "WRITELANE instruction violates constant bus restriction";
+        return false;
+      }
+    }
+  }
+
    // Verify misc. restrictions on specific instructions.
    if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
        Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir

index 1ab10fa92f7b8918fde66e462cc3afdbf0124ebd..12644fd0a4933daef004c1790577e4aaabd4ca9e 100644 (file)
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -317,8 +317,9 @@ body: |
      S_BRANCH %bb.3
  
    bb.3:
+    $m0 = S_MOV_B32 $sgpr4
      $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
+    $vgpr4 = V_WRITELANE_B32 $m0, $vcc_lo, $vgpr4
      S_ENDPGM 0
  
  ...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll

index 2d0ebe8edb7289c0df9a8608ba8b7eeff4b07143..f7f1fd5ea6e519e3f559dba1cb349a897e4f7870 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,10 +1,12 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,CI,CIGFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX9,CIGFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
  
  declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
  
  ; CHECK-LABEL: {{^}}test_writelane_sreg:
-; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
  define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
    %oldval = load i32, i32 addrspace(1)* %out
    %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
@@ -35,11 +37,11 @@ define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x
    ret void
  }
  
-; TODO: m0 should be folded.
  ; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
  ; CHECK: s_mov_b32 m0, -1
  ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
-; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], m0
+; GFX10: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
  define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
    %oldval = load i32, i32 addrspace(1)* %out
    %m0 = call i32 asm "s_mov_b32 m0, -1", "={m0}"()
@@ -59,7 +61,8 @@ define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0)
  
  ; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
  ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
-; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
  define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
    %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
    store i32 %writelane, i32 addrspace(1)* %out, align 4
@@ -68,7 +71,8 @@ define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 add
  
  ; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
  ; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
-; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+; CIGFX9: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, m0
+; GFX10: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
  define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
    %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
    store i32 %writelane, i32 addrspace(1)* %out, align 4
author	David Stuttard <david.stuttard@amd.com>
	Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)
committer	David Stuttard <david.stuttard@amd.com>
	Wed, 16 Oct 2019 14:37:39 +0000 (14:37 +0000)
lib/Target/AMDGPU/SIFixSGPRCopies.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/inserted-wait-states.mir		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll		patch \| blob \| history