From c3fd6a13deb2d9dd467dfcfc7e39e31bcfe835b9 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 23 Aug 2019 22:09:58 +0000 Subject: [PATCH] [AMDGPU] w/a for gfx908 mfma SrcC literal HW bug gfx908 ignores an mfma if SrcC is a literal. Differential Revision: https://reviews.llvm.org/D66670 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369816 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPU.td | 7 +++++++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 + lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 +++++ lib/Target/AMDGPU/SIFoldOperands.cpp | 6 +++++- lib/Target/AMDGPU/SIRegisterInfo.cpp | 10 ++++++++++ lib/Target/AMDGPU/SIRegisterInfo.h | 6 ++---- test/CodeGen/AMDGPU/agpr-register-count.ll | 2 +- test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 22 +++++++++++++++++----- 8 files changed, 48 insertions(+), 11 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 89dbfc17eca..42b477e07b3 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -154,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" >; +def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug", + "HasMFMAInlineLiteralBug", + "true", + "MFMA cannot use inline literal as SrcC" +>; + def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", "HasVcmpxPermlaneHazard", "true", @@ -811,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet< FeaturePkFmacF16Inst, FeatureAtomicFaddInsts, FeatureSRAMECC, + FeatureMFMAInlineLiteralBug, FeatureCodeObjectV3]>; def FeatureISAVersion9_0_9 : FeatureSet< diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index e2cdb659a23..3e556b46fa5 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -262,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, AddNoCarryInsts(false), HasUnpackedD16VMem(false), LDSMisalignedBug(false), + HasMFMAInlineLiteralBug(false), ScalarizeGlobal(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index e64b262082c..2da723a1166 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -368,6 +368,7 @@ protected: bool CaymanISA; bool CFALUBug; bool LDSMisalignedBug; + bool HasMFMAInlineLiteralBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; @@ -987,6 +988,10 @@ public: return SGPRInitBug; } + bool hasMFMAInlineLiteralBug() const { + return HasMFMAInlineLiteralBug; + } + bool has12DWordStoreHazard() const { return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; } diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index caef36c70e7..e8f7fdb57fb 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -435,7 +435,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) return false; - if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) { + if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) && + TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) { UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm()); return true; } @@ -481,6 +482,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII, return false; // Can only fold splat constants } + if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op)) + return false; + FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op)); return true; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 08d7167c67f..7cc7d32dc50 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -61,6 +61,7 @@ static cl::opt EnableSpillSGPRToVGPR( SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : AMDGPURegisterInfo(), + ST(ST), SGPRPressureSets(getNumRegPressureSets()), VGPRPressureSets(getNumRegPressureSets()), AGPRPressureSets(getNumRegPressureSets()), @@ -1582,6 +1583,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( } } +bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { + if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST) + return !ST.hasMFMAInlineLiteralBug(); + + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; +} + bool SIRegisterInfo::shouldRewriteCopySrc( const TargetRegisterClass *DefRC, unsigned DefSubReg, diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index a847db98479..d861c76faf8 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -27,6 +27,7 @@ class SIMachineFunctionInfo; class SIRegisterInfo final : public AMDGPURegisterInfo { private: + const GCNSubtarget &ST; unsigned SGPRSetID; unsigned VGPRSetID; unsigned AGPRSetID; @@ -193,10 +194,7 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool opCanUseInlineConstant(unsigned OpType) const { - return OpType >= AMDGPU::OPERAND_SRC_FIRST && - OpType <= AMDGPU::OPERAND_SRC_LAST; - } + bool opCanUseInlineConstant(unsigned OpType) const; unsigned findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, diff --git a/test/CodeGen/AMDGPU/agpr-register-count.ll b/test/CodeGen/AMDGPU/agpr-register-count.ll index dfedd2402a0..c2dfbe27890 100644 --- a/test/CodeGen/AMDGPU/agpr-register-count.ll +++ b/test/CodeGen/AMDGPU/agpr-register-count.ll @@ -3,7 +3,7 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) ; GCN-LABEL: {{^}}test_32_agprs: -; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, 0 +; GCN: v_mfma_f32_32x32x1f32 a[0:31], {{v[0-9]+}}, {{v[0-9]+}}, ; GCN-NOT: v28 ; GCN: NumVgprs: 32 ; GCN: VGPRBlocks: 7 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 5ac03632fbb..00199faf04a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,NOLIT-SRCC %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,LIT-SRCC %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) @@ -993,7 +994,12 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_imm_splat: ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 ; GCN: v_accvgpr_read_b32 ; GCN: v_accvgpr_read_b32 ; GCN: v_accvgpr_read_b32 @@ -1009,7 +1015,9 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_16x16x1f32_imm_splat: ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_16x16x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 1.0 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 @@ -1040,7 +1048,9 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x8f16_imm_splat: ; GCN-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 0x40004000 ; GCN-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 0x3c003c00 -; GCN: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0 +; NOLIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_32x32x8f16 a[{{[0-9]+:[0-9]+}}], v{{\[}}[[ONE]]:{{[0-9]+}}], v{{\[}}[[TWO]]:{{[0-9]+}}], 1.0 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 @@ -1071,7 +1081,9 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_imm_splat: ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 +; NOLIT-SRCC-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0 +; NOLIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9:]+}}] +; LIT-SRCC: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], 0 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 ; GCN-DAG: v_accvgpr_read_b32 -- 2.40.0