From 5ecf4bbd9760ce811c76a31efbf1d1fe37a93588 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 14 Jun 2019 21:52:26 +0000
Subject: [PATCH] AMDGPU: Avoid most waitcnts before calls

Currently you get extra waits, because waits are inserted for the
register dependencies of the call even though the function prolog
already waits on everything.

Currently waits are still inserted on returns. It may make sense not
to do this, and to wait in the caller instead.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363465 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInsertWaitcnts.cpp     | 156 ++++++++++++---------
 test/CodeGen/AMDGPU/call-argument-types.ll |  49 ++++---
 test/CodeGen/AMDGPU/call-waitcnt.ll        |   6 +-
 3 files changed, 115 insertions(+), 96 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 34513d3450a..9e6dbd692e1 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -808,6 +808,21 @@ static bool readsVCCZ(const MachineInstr &MI) {
          !MI.getOperand(1).isUndef();
 }
 
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+  // Currently all conventions wait, but this may not always be the case.
+  //
+  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+  // sense to omit the wait and do it in the caller.
+  return true;
+}
+
+/// \returns true if the callee is expected to wait on any outstanding counts
+/// before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+  return true;
+}
+
 /// Generate s_waitcnt instruction to be placed before cur_Inst.
 /// Instructions of a given type are returned in order,
 /// but instructions of different types can complete out of order.
@@ -843,7 +858,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
   // NOTE: this could be improved with knowledge of all call sites or
   //   with knowledge of the called routines.
   if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
-      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
+      MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
   }
   // Resolve vm waits before gs-done.
@@ -923,91 +939,91 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     }
   }
 
-#if 0 // TODO: the following code to handle CALL.
-  // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
-  // However, there is a problem with EXP_CNT, because the call cannot
-  // easily tell if a register is used in the function, and if it did, then
-  // the referring instruction would have to have an S_WAITCNT, which is
-  // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
-  // before the call.
-  if (MI.getOpcode() == SC_CALL) {
-    if (ScoreBrackets->getScoreUB(EXP_CNT) >
-        ScoreBrackets->getScoreLB(EXP_CNT)) {
-      ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-      EmitWaitcnt |= CNT_MASK(EXP_CNT);
-    }
-  }
-#endif
-
-  // FIXME: Should not be relying on memoperands.
-  // Look at the source operands of every instruction to see if
-  // any of them results from a previous memory operation that affects
-  // its current usage. If so, an s_waitcnt instruction needs to be
-  // emitted.
-  // If the source operand was defined by a load, add the s_waitcnt
-  // instruction.
- for (const MachineMemOperand *Memop : MI.memoperands()) { - unsigned AS = Memop->getAddrSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS) - continue; - unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } + if (MI.isCall() && callWaitsOnFunctionEntry(MI)) { + // Don't bother waiting on anything except the call address. The function + // is going to insert a wait on everything in its prolog. This still needs + // to be careful if the call target is a load (e.g. a GOT load). + Wait = AMDGPU::Waitcnt(); - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - const MachineOperand &Op = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + int CallAddrOpIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, + CallAddrOpIdx, false); for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Op.getReg())) { - // VM_CNT is only relevant to vgpr or LDS. - ScoreBrackets.determineWait( - VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - } ScoreBrackets.determineWait( LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait); } - } - // End of for loop that looks at all source operands to decide vm_wait_cnt - // and lgk_wait_cnt. - - // Two cases are handled for destination operands: - // 1) If the destination operand was defined by a load, add the s_waitcnt - // instruction to guarantee the right WAW order. - // 2) If a destination operand that was used by a recent export/store ins, - // add s_waitcnt on exp_cnt to guarantee the WAR order. - if (MI.mayStore()) { + } else { // FIXME: Should not be relying on memoperands. + // Look at the source operands of every instruction to see if + // any of them results from a previous memory operation that affects + // its current usage. If so, an s_waitcnt instruction needs to be + // emitted. + // If the source operand was defined by a load, add the s_waitcnt + // instruction. for (const MachineMemOperand *Memop : MI.memoperands()) { unsigned AS = Memop->getAddrSpace(); if (AS != AMDGPUAS::LOCAL_ADDRESS) continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; + // VM_CNT is only relevant to vgpr or LDS. ScoreBrackets.determineWait( VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait); - ScoreBrackets.determineWait( - EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait); } - } - for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { - MachineOperand &Def = MI.getOperand(I); - const MachineRegisterInfo &MRIA = *MRI; - RegInterval Interval = - ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true); - for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - if (TRI->isVGPR(MRIA, Def.getReg())) { + + for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + const MachineRegisterInfo &MRIA = *MRI; + RegInterval Interval = + ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false); + for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + if (TRI->isVGPR(MRIA, Op.getReg())) { + // VM_CNT is only relevant to vgpr or LDS. 
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+        }
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+      }
+    }
+    // End of for loop that looks at all source operands to decide vm_wait_cnt
+    // and lgk_wait_cnt.
+
+    // Two cases are handled for destination operands:
+    // 1) If the destination operand was defined by a load, add the s_waitcnt
+    // instruction to guarantee the right WAW order.
+    // 2) If a destination operand that was used by a recent export/store ins,
+    // add s_waitcnt on exp_cnt to guarantee the WAR order.
+    if (MI.mayStore()) {
+      // FIXME: Should not be relying on memoperands.
+      for (const MachineMemOperand *Memop : MI.memoperands()) {
+        unsigned AS = Memop->getAddrSpace();
+        if (AS != AMDGPUAS::LOCAL_ADDRESS)
+          continue;
+        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
         ScoreBrackets.determineWait(
             VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
         ScoreBrackets.determineWait(
             EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
       }
-  }
-  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-    MachineOperand &Def = MI.getOperand(I);
-    const MachineRegisterInfo &MRIA = *MRI;
-    RegInterval Interval =
-        ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
-    for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
-      if (TRI->isVGPR(MRIA, Def.getReg())) {
+    }
+
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      MachineOperand &Def = MI.getOperand(I);
+      const MachineRegisterInfo &MRIA = *MRI;
+      RegInterval Interval =
+          ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        if (TRI->isVGPR(MRIA, Def.getReg())) {
+          ScoreBrackets.determineWait(
+              VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+          ScoreBrackets.determineWait(
+              EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+        }
+        ScoreBrackets.determineWait(
+            LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+      }
+    } // End of for loop that looks at all dest operands.
+  }
   }
 
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
@@ -1224,6 +1240,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
     }
   } else if (TII->isSMRD(Inst)) {
     ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else if (Inst.isCall()) {
+    if (callWaitsOnFunctionReturn(Inst)) {
+      // Act as a wait on everything.
+      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+    } else {
+      // May still need to wait for anything outstanding.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt()); + } } else { switch (Inst.getOpcode()) { case AMDGPU::S_SENDMSG: diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index 9432031527c..2301cfd94e7 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -154,7 +154,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GCN-DAG: s_mov_b32 s4, s33 ; GCN-DAG: s_mov_b32 s32, s3 -; GCN: s_waitcnt vmcnt(0) +; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { @@ -174,7 +174,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GCN-DAG: s_mov_b32 s32, s33 -; GCN: s_waitcnt vmcnt(0) +; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { @@ -205,7 +205,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GCN-DAG: s_mov_b32 s32, s33 -; GCN: s_waitcnt vmcnt(0) +; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { @@ -224,7 +224,7 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GCN-DAG: s_mov_b32 s32, s33 -; GCN: s_waitcnt vmcnt(0) +; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { @@ -267,8 +267,8 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_v2i64: ; GCN: buffer_load_dwordx4 v[0:3] -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { %val = load <2 x i64>, <2 x i64> addrspace(1)* null call void @external_void_func_v2i64(<2 x i64> %val) @@ -290,8 +290,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GCN: buffer_load_dwordx4 v[0:3] ; GCN: v_mov_b32_e32 v4, 1 ; GCN: v_mov_b32_e32 v5, 2 -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { %load = load <2 x i64>, <2 x i64> addrspace(1)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <3 x i32> @@ -307,8 +307,8 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; GCN-DAG: v_mov_b32_e32 v6, 3 ; GCN-DAG: v_mov_b32_e32 v7, 4 -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { %load = load <2 x i64>, <2 x i64> addrspace(1)* null %val = shufflevector <2 x i64> %load, <2 x i64> , <4 x i32> @@ -483,8 +483,8 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_v2i32: ; GCN: buffer_load_dwordx2 v[0:1] -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { %val = load <2 x i32>, <2 x i32> addrspace(1)* undef call void @external_void_func_v2i32(<2 x i32> %val) @@ -527,8 +527,8 @@ define amdgpu_kernel void 
@test_call_external_void_func_v3i32_i32(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32: ; GCN: buffer_load_dwordx4 v[0:3] -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { %val = load <4 x i32>, <4 x i32> addrspace(1)* undef call void @external_void_func_v4i32(<4 x i32> %val) @@ -562,8 +562,8 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr @@ -591,8 +591,8 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GCN-DAG: buffer_load_dwordx4 v[4:7], off ; GCN-DAG: buffer_load_dwordx4 v[8:11], off ; GCN-DAG: buffer_load_dwordx4 v[12:15], off -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr @@ -609,8 +609,8 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GCN-DAG: buffer_load_dwordx4 v[20:23], off ; GCN-DAG: buffer_load_dwordx4 v[24:27], off ; GCN-DAG: buffer_load_dwordx4 v[28:31], off -; GCN: s_waitcnt -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr @@ -647,12 +647,11 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ret void } -; FIXME: No wait after call ; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm: ; GCN: v_mov_b32_e32 v0, 42 ; GCN: s_swappc_b64 s[30:31], -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0 +; GCN-NOT: s_waitcnt +; GCN: buffer_store_dword v0, off, s[36:39], 0 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { %val = call i32 @external_i32_func_i32(i32 42) store volatile i32 %val, i32 addrspace(1)* %out @@ -662,8 +661,8 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* ; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32: ; GCN: buffer_load_ubyte v0, off ; GCN: buffer_load_dword v1, off -; GCN: s_waitcnt vmcnt(0) -; GCN-NEXT: s_swappc_b64 +; GCN-NOT: s_waitcnt +; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0 diff --git a/test/CodeGen/AMDGPU/call-waitcnt.ll b/test/CodeGen/AMDGPU/call-waitcnt.ll index bb3888c0d76..0557c0fb613 100644 --- a/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -17,7 +17,6 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 -; GCN-NEXT: s_waitcnt 
lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr @@ -67,7 +66,6 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 ; GCN-NEXT: global_store_dword v[0:1], v32, off @@ -91,7 +89,6 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* ; GCN-NEXT: s_addc_u32 s7, s7, func.return@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NEXT: global_store_dword v[1:2], v0, off @@ -138,7 +135,7 @@ define void @tailcall_got_load(i32 addrspace(1)* %ptr, i32) #0 { ret void } -; Need to wait for the address dependency +; No need to wait for the load. define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-LABEL: tail_call_memory_arg_load: ; GCN: ; %bb.0: @@ -147,7 +144,6 @@ define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 { ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[6:7] %vgpr = load volatile i32, i32 addrspace(3)* %ptr tail call void @func(i32 %vgpr) -- 2.50.1
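
A minimal before/after sketch of the codegen effect, assembled from the
tail_call_memory_arg_load checks above (only the instructions around the
removed wait are shown; nothing here is additional output of this patch):

  Before:
    ds_read_b32 v0, v0
    s_waitcnt lgkmcnt(0)
    s_setpc_b64 s[6:7]

  After:
    ds_read_b32 v0, v0
    s_setpc_b64 s[6:7]

The lgkmcnt wait on the LDS load feeding the outgoing argument is dropped
because callWaitsOnFunctionEntry() reports that the callee waits on
everything in its prolog.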