[AMDGPU] Fix MaxWorkGroupsPerCU for large workgroups

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

index 02e47afeb91c4214d771e3331378f10b43e67ca6..0a1ab73d8dcfda545f321fc05d041e346dbb3cb0 100644 (file)
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -151,7 +151,11 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
                                 unsigned FlatWorkGroupSize) {
    if (!Features.test(FeatureGCN))
      return 8;
-  return getWavesPerWorkGroup(Features, FlatWorkGroupSize) == 1 ? 40 : 16;
+  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  if (N == 1)
+    return 40;
+  N = 40 / N;
+  return std::min(N, 16u);
  }
  
  unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

index 71d21b120f0fa8f9ff3ec29bde131ff77d40d5cf..93b33736b2a044f76288ebc0794c1db4cf49f4e1 100644 (file)
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -69,7 +69,8 @@ entry:
  }
  
  ; ALL-LABEL: @occupancy_0(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
  define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
  entry:
    %stack = alloca [5 x i32], align 4
@@ -91,7 +92,8 @@ entry:
  }
  
  ; ALL-LABEL: @occupancy_max(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
  define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
  entry:
    %stack = alloca [5 x i32], align 4
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Wed, 15 Feb 2017 01:03:59 +0000 (01:03 +0000)
lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll		patch \| blob \| history