}
}
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
initializeSubtargetDependencies(TT, GPU, FS);
}
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
- switch (NWaves) {
- case 10:
- return 1638;
- case 9:
- return 1820;
- case 8:
- return 2048;
- case 7:
- return 2340;
- case 6:
- return 2730;
- case 5:
- return 3276;
- case 4:
- return 4096;
- case 3:
- return 5461;
- case 2:
- return 8192;
- default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+ const Function &F) const {
+ if (NWaves == 1)
return getLocalMemorySize();
- }
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
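// Illustrative sketch, not part of the patch: the per-wave-count LDS budget is
// now derived from the subtarget limits instead of the removed SI-only table.
// The helper and the constants in the example (64 KiB of LDS, 10 waves per EU,
// 5 work groups per CU) are assumptions chosen only to show the arithmetic.
static unsigned maxLDSForWaveCount(unsigned LocalMemSize, unsigned MaxWaves,
                                   unsigned WorkGroupsPerCU, unsigned NWaves) {
  if (NWaves == 1)
    return LocalMemSize; // A single wave may use all of the LDS.
  return LocalMemSize * MaxWaves / WorkGroupsPerCU / NWaves;
}
// Example: maxLDSForWaveCount(65536, 10, 5, 8) == 65536 * 10 / 5 / 8 == 16384.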
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
- if (Bytes <= 1638)
- return 10;
-
- if (Bytes <= 1820)
- return 9;
-
- if (Bytes <= 2048)
- return 8;
-
- if (Bytes <= 2340)
- return 7;
-
- if (Bytes <= 2730)
- return 6;
-
- if (Bytes <= 3276)
- return 5;
-
- if (Bytes <= 4096)
- return 4;
-
- if (Bytes <= 5461)
- return 3;
-
- if (Bytes <= 8192)
- return 2;
-
- return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+ const Function &F) const {
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+ unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+ NumWaves = std::min(NumWaves, MaxWaves);
+ NumWaves = std::max(NumWaves, 1u);
+ return NumWaves;
}
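// Illustrative sketch, not part of the patch: the inverse query divides the
// per-CU LDS budget by the bytes actually used and clamps the result to the
// wave range. The sample numbers mirror the assumptions above and are
// illustrative only.
#include <algorithm>
#include <cstdint>

static unsigned occupancyForLDSUse(unsigned LocalMemSize, unsigned MaxWaves,
                                   unsigned WorkGroupsPerCU, uint32_t Bytes) {
  unsigned Limit = LocalMemSize * MaxWaves / WorkGroupsPerCU;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); // Guard against divide-by-zero.
  return std::max(std::min(NumWaves, MaxWaves), 1u);
}
// Example: occupancyForLDSUse(65536, 10, 5, 20000) == 131072 / 20000 == 6.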
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
/// Return the amount of LDS that can be used without restricting the
/// occupancy below WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
/// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
/// if the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
bool hasFP16Denormals() const {
return FP64FP16Denormals;
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction()));
}
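// Illustrative sketch, not part of the patch: the scheduler's occupancy
// estimate is simply the smallest of the register-based limits and the (now
// per-function) LDS-based limit. The values in the example are assumed, not
// measured.
#include <algorithm>

static unsigned combinedOccupancy(unsigned SGPRLimited, unsigned VGPRLimited,
                                  unsigned LDSLimited) {
  return std::min({SGPRLimited, VGPRLimited, LDSLimited});
}
// Example: combinedOccupancy(8, 10, 6) == 6; LDS is the binding constraint.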
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
; CI-PROMOTE: ds_read_b64
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
%val = load double, double addrspace(1)* %in, align 8
- %array = alloca [16 x double], align 8
- %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
+ %array = alloca [8 x double], align 8
+ %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
store double %val, double* %ptr, align 8
call void @llvm.amdgcn.s.barrier()
%result = load double, double* %ptr, align 8
; CI-PROMOTE: ds_read2_b64
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x double>], align 16
- %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x double>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x double>, <2 x double>* %ptr, align 16
; CI-PROMOTE: ds_read2_b64
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x i64>], align 16
- %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x i64>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
-; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
ret void
}
-; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
ret void
}
-; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
ret void
}
-; CHECK-LABEL: @occupancy_0(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_0(
+; ALL: alloca [5 x i32]
define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
%stack = alloca [5 x i32], align 4
ret void
}
-; CHECK-LABEL: @occupancy_max(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_max(
+; ALL: alloca [5 x i32]
define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
%stack = alloca [5 x i32], align 4
ret void
}
-; CHECK-LABEL: @occupancy_6(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_6(
+; CI-LABEL: @occupancy_6(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [42 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_6_over(
-; CHECK: alloca [43 x i8]
+; ALL-LABEL: @occupancy_6_over(
+; ALL: alloca [43 x i8]
define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [43 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_8(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_8(
+; CI-LABEL: @occupancy_8(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [32 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_8_over(
-; CHECK: alloca [33 x i8]
+; ALL-LABEL: @occupancy_8_over(
+; ALL: alloca [33 x i8]
define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [33 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_9(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_9(
+; CI-LABEL: @occupancy_9(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [28 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_9_over(
-; CHECK: alloca [29 x i8]
+; ALL-LABEL: @occupancy_9_over(
+; ALL: alloca [29 x i8]
define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [29 x i8], align 4