!MI.getOperand(1).isUndef();
}
+/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
+static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
+ // Currently all conventions wait, but this may not always be the case.
+ //
+ // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
+ // sense to omit the wait and do it in the caller.
+ return true;
+}
+
+/// \returns true if the callee is expected to wait on any outstanding
+/// counters before returning.
+static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
+ return true;
+}
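+
+// Both predicates currently return true. Under this contract the callee
+// waits on all counters in its prologue and again before returning, so a
+// caller only needs to wait on the call address operand before a call and
+// may assume zeroed counters once the call returns.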
+
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
- MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
+ MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
+ (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
}
// Resolve vm waits before gs-done.
}
}
-#if 0 // TODO: the following code to handle CALL.
- // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
- // However, there is a problem with EXP_CNT, because the call cannot
- // easily tell if a register is used in the function, and if it did, then
- // the referring instruction would have to have an S_WAITCNT, which is
- // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
- // before the call.
- if (MI.getOpcode() == SC_CALL) {
- if (ScoreBrackets->getScoreUB(EXP_CNT) >
- ScoreBrackets->getScoreLB(EXP_CNT)) {
- ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= CNT_MASK(EXP_CNT);
- }
- }
-#endif
-
- // FIXME: Should not be relying on memoperands.
- // Look at the source operands of every instruction to see if
- // any of them results from a previous memory operation that affects
- // its current usage. If so, an s_waitcnt instruction needs to be
- // emitted.
- // If the source operand was defined by a load, add the s_waitcnt
- // instruction.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
+ if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
+ // Don't bother waiting on anything except the call address. The callee
+ // will insert a wait on everything in its prologue. We still need to be
+ // careful if the call target was produced by a load (e.g. a GOT load).
+ Wait = AMDGPU::Waitcnt();
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ int CallAddrOpIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
+ CallAddrOpIdx, false);
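+ // Only LGKM_CNT matters here: the call address lives in SGPRs, so a
+ // loaded address (e.g. from the GOT) comes from an SMEM load, which is
+ // tracked by LGKM_CNT rather than VM_CNT.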
for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
+ } else {
// FIXME: Should not be relying on memoperands.
+ // Look at the source operands of every instruction to see if
+ // any of them results from a previous memory operation that affects
+ // its current usage. If so, an s_waitcnt instruction needs to be
+ // emitted.
+ // If the source operand was defined by a load, add the s_waitcnt
+ // instruction.
for (const MachineMemOperand *Memop : MI.memoperands()) {
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+ // VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- }
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
+
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &Op = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Op.getReg())) {
+ // VM_CNT is only relevant to vgpr or LDS.
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ }
+ // End of for loop that looks at all source operands to decide vm_wait_cnt
+ // and lgk_wait_cnt.
+
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand was used by a recent export/store
+ // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
+ if (MI.mayStore()) {
+ // FIXME: Should not be relying on memoperands.
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ unsigned AS = Memop->getAddrSpace();
+ if (AS != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ MachineOperand &Def = MI.getOperand(I);
+ const MachineRegisterInfo &MRIA = *MRI;
+ RegInterval Interval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
+ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(MRIA, Def.getReg())) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
+ ScoreBrackets.determineWait(
+ LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
+ }
+ } // End of for loop that looks at all dest operands.
+ }
}
// Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
}
} else if (TII->isSMRD(Inst)) {
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+ } else if (Inst.isCall()) {
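+ // applyWaitcnt updates the score brackets as if the given wait executed:
+ // allZero marks every counter resolved, while an empty Waitcnt leaves all
+ // pending events outstanding.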
+ if (callWaitsOnFunctionReturn(Inst)) {
+ // Act as a wait on everything.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
+ } else {
+ // May need to wait for anything still outstanding.
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
+ }
} else {
switch (Inst.getOpcode()) {
case AMDGPU::S_SENDMSG:
; GCN-DAG: s_mov_b32 s4, s33
; GCN-DAG: s_mov_b32 s32, s3
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
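+; The callee waits on all counters on entry, so no wait for the argument
+; load is needed before the call.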
define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GCN-DAG: s_mov_b32 s32, s33
-; GCN: s_waitcnt vmcnt(0)
+; GCN-NOT: s_waitcnt
; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}}
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* null
call void @external_void_func_v2i64(<2 x i64> %val)
; GCN: buffer_load_dwordx4 v[0:3]
; GCN: v_mov_b32_e32 v4, 1
; GCN: v_mov_b32_e32 v5, 2
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
%load = load <2 x i64>, <2 x i64> addrspace(1)* null
%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
; GCN-DAG: v_mov_b32_e32 v6, 3
; GCN-DAG: v_mov_b32_e32 v7, 4
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
%load = load <2 x i64>, <2 x i64> addrspace(1)* null
%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
; GCN: buffer_load_dwordx2 v[0:1]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
call void @external_void_func_v2i32(<2 x i32> %val)
; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
call void @external_void_func_v4i32(<4 x i32> %val)
; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
%ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
%val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
%ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
%val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
%ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
%val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
ret void
}
-; FIXME: No wait after call
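+; No wait is needed after the call: the callee waits on all outstanding
+; counters before returning, so the result in v0 can be stored immediately.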
; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
; GCN: v_mov_b32_e32 v0, 42
; GCN: s_swappc_b64 s[30:31],
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; GCN-NOT: s_waitcnt
+; GCN: buffer_store_dword v0, off, s[36:39], 0
define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 {
%val = call i32 @external_i32_func_i32(i32 42)
store volatile i32 %val, i32 addrspace(1)* %out
; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
; GCN: buffer_load_ubyte v0, off
; GCN: buffer_load_dword v1, off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_swappc_b64
+; GCN-NOT: s_waitcnt
+; GCN: s_swappc_b64
define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
%ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_endpgm
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: v_mov_b32_e32 v1, s35
; GCN-NEXT: global_store_dword v[0:1], v32, off
; GCN-NEXT: s_addc_u32 s7, s7, func.return@rel32@hi+4
; GCN-NEXT: s_mov_b32 s32, s33
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s34
; GCN-NEXT: v_mov_b32_e32 v2, s35
; GCN-NEXT: global_store_dword v[1:2], v0, off
ret void
}
-; Need to wait for the address dependency
+; No need to wait for the load; the callee waits on all counters on entry.
define void @tail_call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 {
; GCN-LABEL: tail_call_memory_arg_load:
; GCN: ; %bb.0:
; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_setpc_b64 s[6:7]
%vgpr = load volatile i32, i32 addrspace(3)* %ptr
tail call void @func(i32 %vgpr)