return std::make_pair(LoopBB, RemainderBB);
}
+/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
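+/// For example, bundling a DS_GWS_BARRIER yields (structure as in the MIR
+/// test checks below):
+///   BUNDLE implicit $vgpr0, implicit $m0, implicit $exec {
+///     DS_GWS_BARRIER $vgpr0, 1, -1, implicit $m0, implicit $exec
+///     S_WAITCNT 0
+///   }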
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ auto I = MI.getIterator();
+ auto E = std::next(I);
+
+ BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ MIBundleBuilder Bundler(*MBB, I, E);
+ finalizeBundle(*MBB, Bundler.begin());
+}
+
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *BB) const {
MRI.setSimpleHint(Data0, Src->getReg());
}
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
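+ // Bundle the waitcnt with the GWS op so later passes cannot separate them,
+ // and ExpandPostRAPseudos does not drop the wait.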
+ bundleInstWithWaitcnt(MI);
unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
case AMDGPU::DS_GWS_BARRIER:
- if (getSubtarget()->hasGWSAutoReplay())
+ // An s_waitcnt 0 is required to be the instruction immediately following the GWS op.
+ if (getSubtarget()->hasGWSAutoReplay()) {
+ bundleInstWithWaitcnt(MI);
return BB;
+ }
+
return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,NOLOOP,GFX10 %s
+; Make sure the op is emitted bundled with a waitcnt, both with and without the retry loop, and that the bundle is not removed by ExpandPostRAPseudos.
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=postrapseudos -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MIR %s
+
; Minimum offset
; GCN-LABEL: {{^}}gws_barrier_offset0:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)
; LOOP-NEXT: s_cmp_lg_u32 [[GETREG]], 0
; LOOP-NEXT: s_cbranch_scc1 [[LOOP]]
+
+; MIR-LABEL: name: gws_barrier_offset0{{$}}
+; MIR: BUNDLE implicit{{( killed)?}} $vgpr0, implicit $m0, implicit $exec {
+; MIR-NEXT: DS_GWS_BARRIER $vgpr0, 1, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource)
+; MIR-NEXT: S_WAITCNT 0
+; MIR-NEXT: }
define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
ret void
}
+; MIR-LABEL: name: gws_barrier_offset63{{$}}
+
; Maximum offset
; GCN-LABEL: {{^}}gws_barrier_offset63:
; NOLOOP-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
; Make sure this increments lgkmcnt
; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
; NOLOOP: ds_gws_barrier v0 offset:1 gds{{$}}
-; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_barrier_lgkmcnt(i32 %val) {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
; GCN-LABEL: {{^}}gws_barrier_wait_after:
; NOLOOP: ds_gws_barrier v0 offset:8 gds
-; NOLOOP-NEXT: s_waitcnt expcnt(0){{$}}
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
; NOLOOP: store_dword
; NOLOOP: s_waitcnt vmcnt(0) lgkmcnt(0)
; NOLOOP: ds_gws_barrier v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
store i32 0, i32 addrspace(1)* %ptr
fence release
ret void
}
+; FIXME: Extra waitcnt
; GCN-LABEL: {{^}}gws_barrier_fence_after:
; NOLOOP: ds_gws_barrier v0 offset:8 gds
; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: load_dword
define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
; GCN-LABEL: {{^}}gws_init_barrier:
; NOLOOP: s_mov_b32 m0, -1
; NOLOOP: ds_gws_init v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
; GCN-LABEL: {{^}}gws_init_fence_barrier:
; NOLOOP: s_mov_b32 m0, -1
; NOLOOP: ds_gws_init v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; NOLOOP-NEXT: ds_gws_barrier v0 offset:8 gds
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
fence release
; GCN-LABEL: {{^}}gws_init_lgkmcnt:
; NOLOOP: ds_gws_init v0 offset:1 gds{{$}}
-; NOLOOP-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; NOLOOP-NEXT: s_setpc_b64
define void @gws_init_lgkmcnt(i32 %val) {
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
; Does not imply memory fence on its own
; GCN-LABEL: {{^}}gws_init_wait_before:
-; NOLOOP: s_waitcnt
+; NOLOOP: s_waitcnt lgkmcnt(0)
; NOLOOP-NOT: s_waitcnt
+; NOLOOP: ds_gws_init
+; NOLOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
store i32 0, i32 addrspace(1)* %ptr
call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)