From 237ec36765b3bb486cc9d0957f00e7552c9ddedd Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Wed, 15 Feb 2017 01:03:59 +0000
Subject: [PATCH] [AMDGPU] Fix MaxWorkGroupsPerCU for large workgroups

This patch corrects the maximum number of workgroups per CU for large
workgroups (more than 128 work-items). This value contributes to the
occupancy calculation with respect to LDS size.

Differential Revision: https://reviews.llvm.org/D29974

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295134 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp             | 6 +++++-
 test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 02e47afeb91..0a1ab73d8dc 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -151,7 +151,11 @@ unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
                                unsigned FlatWorkGroupSize) {
   if (!Features.test(FeatureGCN))
     return 8;
-  return getWavesPerWorkGroup(Features, FlatWorkGroupSize) == 1 ? 40 : 16;
+  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  if (N == 1)
+    return 40;
+  N = 40 / N;
+  return std::min(N, 16u);
 }
 
 unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
index 71d21b120f0..93b33736b2a 100644
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -69,7 +69,8 @@ entry:
 }
 
 ; ALL-LABEL: @occupancy_0(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
 define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
 entry:
   %stack = alloca [5 x i32], align 4
@@ -91,7 +92,8 @@ entry:
 }
 
 ; ALL-LABEL: @occupancy_max(
-; ALL: alloca [5 x i32]
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
 define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 entry:
   %stack = alloca [5 x i32], align 4
-- 
2.50.1