From 5e1edf01fd5161a38501ee43f99eb9950a1c3722 Mon Sep 17 00:00:00 2001 From: Austin Kerbow Date: Mon, 5 Aug 2019 16:09:49 +0000 Subject: [PATCH] [AMDGPU] Use S_DENORM_MODE for gfx10 Summary: During fdiv32 lowering use S_DENORM_MODE to select denorm mode in gfx10. Reviewers: arsenm, rampitec Reviewed By: arsenm, rampitec Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65620 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367882 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 ++ lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 ++ lib/Target/AMDGPU/SIISelLowering.cpp | 61 ++++++++++++++++++------ lib/Target/AMDGPU/SIInstrInfo.cpp | 1 + lib/Target/AMDGPU/SIInstrInfo.td | 7 ++- lib/Target/AMDGPU/SOPInstructions.td | 5 +- test/CodeGen/AMDGPU/fdiv.ll | 45 +++++++++++------ 8 files changed, 97 insertions(+), 31 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 90e0369f3bc..da2169a083c 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4221,6 +4221,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FRACT) NODE_NAME_CASE(SETCC) NODE_NAME_CASE(SETREG) + NODE_NAME_CASE(DENORM_MODE) NODE_NAME_CASE(FMA_W_CHAIN) NODE_NAME_CASE(FMUL_W_CHAIN) NODE_NAME_CASE(CLAMP) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index acafd6fbe5a..421054ef783 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -369,6 +369,9 @@ enum NodeType : unsigned { // result bit per item in the wavefront. SETCC, SETREG, + + DENORM_MODE, + // FP ops with input and output chain. 
FMA_W_CHAIN, FMUL_W_CHAIN, diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index bc100915ac7..18d318f18ba 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -617,6 +617,11 @@ public: return getGeneration() >= AMDGPUSubtarget::GFX9; } + /// \returns true if the target supports S_DENORM_MODE. + bool hasDenormModeInst() const { + return getGeneration() >= AMDGPUSubtarget::GFX10; + } + bool useFlatForGlobal() const { return FlatForGlobal; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index f6d80eb4aa1..fd27d9def31 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7591,6 +7591,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); } +// Returns immediate value for setting the F32 denorm mode when using the +// S_DENORM_MODE instruction. +static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG, + const SDLoc &SL, const GCNSubtarget *ST) { + assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); + int DPDenormModeDefault = ST->hasFP64Denormals() + ? 
FP_DENORM_FLUSH_NONE + : FP_DENORM_FLUSH_IN_FLUSH_OUT; + + int Mode = SPDenormMode | (DPDenormModeDefault << 2); + return DAG.getTargetConstant(Mode, SL, MVT::i32); +} + SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; @@ -7617,16 +7630,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); if (!Subtarget->hasFP32Denormals()) { SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, - SL, MVT::i32); - SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), - EnableDenormValue, BitField); + + SDValue EnableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue EnableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); + + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue); + } else { + const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, + SL, MVT::i32); + EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, + DAG.getEntryNode(), EnableDenormValue, + BitField); + } + SDValue Ops[3] = { NegDivScale0, EnableDenorm.getValue(0), @@ -7648,19 +7671,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, NumeratorScaled, Mul); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3); if 
(!Subtarget->hasFP32Denormals()) { - const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), - DisableDenormValue, - BitField, - Fma4.getValue(2)); + + SDValue DisableDenorm; + if (Subtarget->hasDenormModeInst()) { + const SDValue DisableDenormValue = + getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); + + DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + Fma4.getValue(2)); + } else { + const SDValue DisableDenormValue = + DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + + DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, + Fma4.getValue(1), DisableDenormValue, + BitField, Fma4.getValue(2)); + } SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, DisableDenorm, DAG.getRoot()); diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 85c8abe848c..77dbd239ede 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2671,6 +2671,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || + MI.getOpcode() == AMDGPU::S_DENORM_MODE || changesVGPRIndexingMode(MI); } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 60d9e8f60fa..425105fde54 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -266,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8", [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] >; +def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE", + SDTypeProfile<0 ,1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue] +>; + 
//===----------------------------------------------------------------------===// // ValueType helpers //===----------------------------------------------------------------------===// @@ -689,7 +694,7 @@ def SIMM16bit : ImmLeaf <i32, >; def UIMM16bit : ImmLeaf <i32, - [{return isUInt<16>(Imm); }] + [{return isUInt<16>(Imm);}] >; class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{ diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index 58b0c4beca2..d2d783ba76c 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -1168,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in { def S_ROUND_MODE : SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">; def S_DENORM_MODE : - SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">; + SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16", + [(SIdenorm_mode (i32 timm:$simm16))]> { + let hasSideEffects = 1; + } def S_TTRACEDATA_IMM : SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">; } // End SubtargetPredicate = isGFX10Plus diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index bc489454341..a540589ca42 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,PREGFX10,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | 
FileCheck -check-prefixes=GCN,GFX10,FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; These tests check that fdiv is expanded correctly and also test that the @@ -17,14 +18,16 @@ ; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] -; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX10: s_denorm_mode 15 ; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 ; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] ; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] ; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] ; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] ; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] -; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; PREGFX10: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX10: s_denorm_mode 12 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { @@ -39,17 +42,28 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS ; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]] -; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] ; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]] -; GCN-NOT: s_setreg -; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 -; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] -; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] -; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] -; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] -; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] -; GCN-NOT: s_setreg +; PREGFX10-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; PREGFX10-NOT: s_setreg +; PREGFX10: v_fma_f32 [[A:v[0-9]+]], 
-[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; PREGFX10: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]] +; PREGFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; PREGFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]] +; PREGFX10: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]] +; PREGFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]] +; PREGFX10-NOT: s_setreg + +; GFX10-NOT: s_denorm_mode +; GFX10: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0 +; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]] +; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]] +; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]] +; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]] +; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]] +; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]] +; GFX10-NOT: s_denorm_mode + ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { @@ -88,7 +102,8 @@ entry: ; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] -; GCN-NOT: s_setreg +; PREGFX10-NOT: s_setreg +; GFX10-NOT: s_denorm_mode ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: -- 2.40.0