}
}
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
// Round up to the next tier of usage.
unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
initializeSubtargetDependencies(TT, GPU, FS);
}
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
- switch (NWaves) {
- case 10:
- return 1638;
- case 9:
- return 1820;
- case 8:
- return 2048;
- case 7:
- return 2340;
- case 6:
- return 2730;
- case 5:
- return 3276;
- case 4:
- return 4096;
- case 3:
- return 5461;
- case 2:
- return 8192;
- default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+ const Function &F) const {
+ if (NWaves == 1)
return getLocalMemorySize();
- }
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
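// Illustrative sketch, not part of the patch: the per-wave-count LDS budget is
// now derived from the subtarget limits instead of the removed SI-only table.
// The helper and the constants in the example (64 KiB of LDS, 10 waves per EU,
// 5 work groups per CU) are assumptions chosen only to show the arithmetic.
static unsigned maxLDSForWaveCount(unsigned LocalMemSize, unsigned MaxWaves,
                                   unsigned WorkGroupsPerCU, unsigned NWaves) {
  if (NWaves == 1)
    return LocalMemSize; // A single wave may use all of the LDS.
  return LocalMemSize * MaxWaves / WorkGroupsPerCU / NWaves;
}
// Example: maxLDSForWaveCount(65536, 10, 5, 8) == 65536 * 10 / 5 / 8 == 16384.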
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
- if (Bytes <= 1638)
- return 10;
-
- if (Bytes <= 1820)
- return 9;
-
- if (Bytes <= 2048)
- return 8;
-
- if (Bytes <= 2340)
- return 7;
-
- if (Bytes <= 2730)
- return 6;
-
- if (Bytes <= 3276)
- return 5;
-
- if (Bytes <= 4096)
- return 4;
-
- if (Bytes <= 5461)
- return 3;
-
- if (Bytes <= 8192)
- return 2;
-
- return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+ const Function &F) const {
+ unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+ unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+ unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+ NumWaves = std::min(NumWaves, MaxWaves);
+ NumWaves = std::max(NumWaves, 1u);
+ return NumWaves;
}
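// Illustrative sketch, not part of the patch: the inverse query divides the
// per-CU LDS budget by the bytes actually used and clamps the result to the
// wave range. The sample numbers mirror the assumptions above and are
// illustrative only.
#include <algorithm>
#include <cstdint>

static unsigned occupancyForLDSUse(unsigned LocalMemSize, unsigned MaxWaves,
                                   unsigned WorkGroupsPerCU, uint32_t Bytes) {
  unsigned Limit = LocalMemSize * MaxWaves / WorkGroupsPerCU;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u); // Guard against divide-by-zero.
  return std::max(std::min(NumWaves, MaxWaves), 1u);
}
// Example: occupancyForLDSUse(65536, 10, 5, 20000) == 131072 / 20000 == 6.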
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
/// Return the amount of LDS that can be used without restricting the
/// occupancy below WaveCount.
- unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+ const Function &) const;
/// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wave count
/// if the given LDS memory size is the only constraint.
- unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+ unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
bool hasFP16Denormals() const {
return FP64FP16Denormals;
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
ST.getOccupancyWithNumVGPRs(VGPRs));
return std::min(MinRegOccupancy,
- ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+ *MF.getFunction()));
}
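// Illustrative sketch, not part of the patch: the scheduler's occupancy
// estimate is simply the smallest of the register-based limits and the (now
// per-function) LDS-based limit. The values in the example are assumed, not
// measured.
#include <algorithm>

static unsigned combinedOccupancy(unsigned SGPRLimited, unsigned VGPRLimited,
                                  unsigned LDSLimited) {
  return std::min({SGPRLimited, VGPRLimited, LDSLimited});
}
// Example: combinedOccupancy(8, 10, 6) == 6; LDS is the binding constraint.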
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
; CI-PROMOTE: ds_read_b64
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
%val = load double, double addrspace(1)* %in, align 8
- %array = alloca [16 x double], align 8
- %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
+ %array = alloca [8 x double], align 8
+ %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
store double %val, double* %ptr, align 8
call void @llvm.amdgcn.s.barrier()
%result = load double, double* %ptr, align 8
; CI-PROMOTE: ds_read2_b64
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x double>], align 16
- %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x double>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
store <2 x double> %val, <2 x double>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x double>, <2 x double>* %ptr, align 16
; CI-PROMOTE: ds_read2_b64
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
- %array = alloca [8 x <2 x i64>], align 16
- %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
+ %array = alloca [4 x <2 x i64>], align 16
+ %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
store <2 x i64> %val, <2 x i64>* %ptr, align 16
call void @llvm.amdgcn.s.barrier()
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
-; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
ret void
}
-; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
ret void
}
-; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
ret void
}
-; CHECK-LABEL: @occupancy_0(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_0(
+; ALL: alloca [5 x i32]
define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
%stack = alloca [5 x i32], align 4
ret void
}
-; CHECK-LABEL: @occupancy_max(
-; CHECK: alloca [5 x i32]
+; ALL-LABEL: @occupancy_max(
+; ALL: alloca [5 x i32]
define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
%stack = alloca [5 x i32], align 4
ret void
}
-; CHECK-LABEL: @occupancy_6(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_6(
+; CI-LABEL: @occupancy_6(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [42 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_6_over(
-; CHECK: alloca [43 x i8]
+; ALL-LABEL: @occupancy_6_over(
+; ALL: alloca [43 x i8]
define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
%stack = alloca [43 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_8(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_8(
+; CI-LABEL: @occupancy_8(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [32 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_8_over(
-; CHECK: alloca [33 x i8]
+; ALL-LABEL: @occupancy_8_over(
+; ALL: alloca [33 x i8]
define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
%stack = alloca [33 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_9(
-; CHECK-NOT: alloca
+; SI-LABEL: @occupancy_9(
+; CI-LABEL: @occupancy_9(
+; SI: alloca
+; CI-NOT: alloca
define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [28 x i8], align 4
ret void
}
-; CHECK-LABEL: @occupancy_9_over(
-; CHECK: alloca [29 x i8]
+; ALL-LABEL: @occupancy_9_over(
+; ALL: alloca [29 x i8]
define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
%stack = alloca [29 x i8], align 4