From: Marek Olsak
Date: Fri, 9 Dec 2016 19:49:48 +0000 (+0000)
Subject: AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=36d5f19e1d12d8f98bd3f36c585cebb56c54fa1f;p=llvm

AMDGPU/SI: Don't reserve FLAT_SCR on non-HSA targets & without stack objects

Summary: This frees 2 scalar registers.

Reviewers: tstellarAMD

Subscribers: qcolombet, arsenm, kzhuravl, wdng, nhaehnle, yaxunl, tony-tye

Differential Revision: https://reviews.llvm.org/D27150

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@289261 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index e711a09ccea..7b5ebc57436 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -391,7 +391,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         case AMDGPU::FLAT_SCR:
         case AMDGPU::FLAT_SCR_LO:
         case AMDGPU::FLAT_SCR_HI:
-          FlatUsed = true;
+          // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
+          // instructions aren't used to access the scratch buffer.
+          if (MFI->hasFlatScratchInit())
+            FlatUsed = true;
           continue;

         case AMDGPU::TBA:
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index bdbce8a9dac..0fdd203b3d0 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1178,11 +1178,19 @@ unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
   return 104;
 }

-unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const {
+unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
+                                             const SIMachineFunctionInfo &MFI) const {
+  if (MFI.hasFlatScratchInit()) {
+    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
+
+    if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+      return 4; // FLAT_SCRATCH, VCC (in that order)
+  }
+
   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    return 6; // VCC, FLAT_SCRATCH, XNACK.
-  if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS)
-    return 4; // VCC, FLAT_SCRATCH.
+    return 4; // XNACK, VCC (in that order)
+
   return 2; // VCC.
 }

@@ -1254,7 +1262,7 @@ unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
       F, "amdgpu-num-sgpr", MaxNumSGPRs);

     // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && (Requested <= getNumReservedSGPRs(ST)))
+    if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
       Requested = 0;

     // If more SGPRs are required to support the input user/system SGPRs,
@@ -1283,7 +1291,8 @@ unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
   if (ST.hasSGPRInitBug())
     MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

-  return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST), MaxNumAddressableSGPRs);
+  return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
+                  MaxNumAddressableSGPRs);
 }

 unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index ed2d7b41cc0..672df79218b 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -22,6 +22,7 @@ namespace llvm {

 class SISubtarget;
 class MachineRegisterInfo;
+class SIMachineFunctionInfo;

 class SIRegisterInfo final : public AMDGPURegisterInfo {
 private:
@@ -198,7 +199,8 @@ public:
   unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;

   /// \returns Number of reserved SGPRs supported by the subtarget.
-  unsigned getNumReservedSGPRs(const SISubtarget &ST) const;
+  unsigned getNumReservedSGPRs(const SISubtarget &ST,
+                               const SIMachineFunctionInfo &MFI) const;

   /// \returns Minimum number of SGPRs that meets given number of waves per
   /// execution unit requirement for given subtarget.
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 6b419400615..cab377feacb 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -34,9 +34,9 @@ entry:
 attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}

 ; CHECK-LABEL: {{^}}min_1024_max_2048
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 7
-; CHECK: NumSGPRsForWavesPerEU: 19
+; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
 define void @min_1024_max_2048() #3 {
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 149f71cead7..da49517e003 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,7 +4,7 @@
 ; If spilling to smem, additional registers are used for the resource
 ; descriptor.

-; ALL-LABEL: {{^}}max_14_sgprs:
+; ALL-LABEL: {{^}}max_12_sgprs:

 ; FIXME: Should be ablo to skip this copying of the private segment
 ; buffer because all the SGPR spills are to VGPRs.
@@ -12,8 +12,8 @@
 ; ALL: s_mov_b64 s[6:7], s[2:3]
 ; ALL: s_mov_b64 s[4:5], s[0:1]
 ; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 14
+; ALL: NumSGPRsForWavesPerEU: 12

-define void @max_14_sgprs(i32 addrspace(1)* %out1,
+define void @max_12_sgprs(i32 addrspace(1)* %out1,
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
@@ -35,7 +35,7 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1,

 ; ---------------------
 ; total: 14
-; + reserved vcc, xnack, flat_scratch = 20
+; + reserved vcc = 16

 ; Because we can't handle re-using the last few input registers as the
 ; special vcc etc. registers (as well as decide to not use the unused
@@ -43,15 +43,15 @@ define void @max_14_sgprs(i32 addrspace(1)* %out1,
 ; more than expected.

 ; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
-; TOSGPR: SGPRBlocks: 2
-; TOSGPR: NumSGPRsForWavesPerEU: 20
+; TOSGPR: SGPRBlocks: 1
+; TOSGPR: NumSGPRsForWavesPerEU: 16

 ; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
 ; TOSMEM: s_mov_b64 s[4:5], s[0:1]
-; TOSMEM: s_mov_b32 s3, s13

-; TOSMEM: SGPRBlocks: 2
-; TOSMEM: NumSGPRsForWavesPerEU: 20
+; TOSMEM: SGPRBlocks: 1
+; TOSMEM: NumSGPRsForWavesPerEU: 16
 define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                          i32 addrspace(1)* %out2,
                                          i32 addrspace(1)* %out3,
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 86c22540973..4f4efccc226 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}

 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 1
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 19
+; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index d92efdc6f2d..8ef54b9e95d 100644
--- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -38,7 +38,7 @@ define void @use_too_many_sgprs_bonaire() #1 {
   ret void
 }

-; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr
+; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
 define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index d8807e638fb..1ee9100c2eb 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,18 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=NOXNACK -check-prefix=VI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=XNACK -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s

-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=NOXNACK -check-prefix=HSA-NOXNACK -check-prefix=HSA -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=XNACK -check-prefix=HSA-XNACK -check-prefix=HSA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-XNACK -check-prefix=GCN %s

 ; GCN-LABEL: {{^}}no_vcc_no_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1

-; NOXNACK: ; NumSgprs: 8
-; XNACK: ; NumSgprs: 12
+; CI: ; NumSgprs: 8
+; VI-NOXNACK: ; NumSgprs: 8
+; VI-XNACK: ; NumSgprs: 12
 define void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7}"()
@@ -20,11 +22,13 @@ entry:
 }

 ; GCN-LABEL: {{^}}vcc_no_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1

-; NOXNACK: ; NumSgprs: 10
-; XNACK: ; NumSgprs: 12
+; CI: ; NumSgprs: 10
+; VI-NOXNACK: ; NumSgprs: 10
+; VI-XNACK: ; NumSgprs: 12
 define void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC}"()
@@ -32,11 +36,16 @@ entry:
 }

 ; GCN-LABEL: {{^}}no_vcc_flat:
-; HSA-NOXNACK: is_xnack_enabled = 0
-; HSA-XNACK: is_xnack_enabled = 1
+; HSA-CI: is_xnack_enabled = 0
+; HSA-VI-NOXNACK: is_xnack_enabled = 0
+; HSA-VI-XNACK: is_xnack_enabled = 1

-; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; CI: ; NumSgprs: 8
+; VI-NOXNACK: ; NumSgprs: 8
+; VI-XNACK: ; NumSgprs: 12
+; HSA-CI: ; NumSgprs: 8
+; HSA-VI-NOXNACK: ; NumSgprs: 8
+; HSA-VI-XNACK: ; NumSgprs: 12
 define void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -47,8 +56,12 @@ entry:
 ; HSA-NOXNACK: is_xnack_enabled = 0
 ; HSA-XNACK: is_xnack_enabled = 1

-; CI: ; NumSgprs: 12
-; VI: ; NumSgprs: 14
+; CI: ; NumSgprs: 10
+; VI-NOXNACK: ; NumSgprs: 10
+; VI-XNACK: ; NumSgprs: 12
+; HSA-CI: ; NumSgprs: 10
+; HSA-VI-NOXNACK: ; NumSgprs: 10
+; HSA-VI-XNACK: ; NumSgprs: 12
 define void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 24b0a81c18e..8c16b9d1649 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -7,7 +7,8 @@
 ; XXX - Why does it like to use vcc?

 ; GCN-LABEL: {{^}}spill_m0:
-; TOSMEM: s_mov_b32 s84, SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0
+; TOSMEM: s_mov_b32 s[[HI:[0-9]+]], 0xe80000

 ; GCN-DAG: s_cmp_lg_u32

@@ -22,7 +23,7 @@
 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOSMEM: s_mov_b32 m0, s3{{$}}
 ; TOSMEM-NOT: [[M0_COPY]]
-; TOSMEM: s_buffer_store_dword [[M0_COPY]], s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
 ; TOSMEM: s_waitcnt lgkmcnt(0)

 ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
@@ -37,7 +38,7 @@
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]

 ; TOSMEM: s_mov_b32 m0, s3{{$}}
-; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: [[M0_RESTORE]]
 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]

@@ -161,10 +162,10 @@ endif:
 ; TOSMEM: s_cmp_eq_u32
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[84:87], m0 ; 8-byte Folded Spill
+; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x200
-; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[84:87], m0 ; 4-byte Folded Spill
+; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1

@@ -172,7 +173,7 @@ endif:

 ; TOSMEM: s_mov_b32 vcc_hi, m0
 ; TOSMEM: s_mov_b32 m0, s3
-; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[84:87], m0 ; 8-byte Folded Reload
+; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_waitcnt lgkmcnt(0)

@@ -180,7 +181,7 @@ endif:

 ; TOSMEM-NOT: m0
 ; TOSMEM: s_add_u32 m0, s3, 0x200
-; TOSMEM: s_buffer_load_dword s0, s[84:87], m0 ; 4-byte Folded Reload
+; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
 ; TOSMEM-NOT: m0