From: Hans Wennborg
Date: Wed, 10 Aug 2016 16:45:40 +0000 (+0000)
Subject: Merging r276051 and r276823:
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3248ce68a151b2768bfcc85a4da5cb8ec1060244;p=llvm

Merging r276051 and r276823:

------------------------------------------------------------------------
r276051 | arsenm | 2016-07-19 16:16:53 -0700 (Tue, 19 Jul 2016) | 8 lines

AMDGPU: Change fdiv lowering based on !fpmath metadata

If 2.5 ulp of error is acceptable, denormals are not required, and the
operation is not a reciprocal (which is already handled separately),
replace the fdiv with a faster lowering.

Simplify the lowering tests by using per-function subtarget features.
------------------------------------------------------------------------

------------------------------------------------------------------------
r276823 | arsenm | 2016-07-26 16:25:44 -0700 (Tue, 26 Jul 2016) | 4 lines

AMDGPU: Use rcp for fdiv 1, x with fpmath metadata

Using rcp is usually acceptable for safe math, so this should not
replace the original fdiv.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@278243 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 7e59710a427..d4784b5463d 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -20,6 +20,7 @@ class AMDGPUInstrPrinter;
 class AMDGPUSubtarget;
 class AMDGPUTargetMachine;
 class FunctionPass;
+class GCNTargetMachine;
 struct MachineSchedContext;
 class MCAsmInfo;
 class raw_ostream;
@@ -50,7 +51,7 @@ FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
-FunctionPass *createAMDGPUCodeGenPreparePass(const TargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
 
 ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C);
 
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 3b415774df4..b955e231699 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
 
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/Passes.h"
@@ -30,15 +32,28 @@ using namespace llvm;
 namespace {
 
 class AMDGPUCodeGenPrepare : public FunctionPass,
-                             public InstVisitor<AMDGPUCodeGenPrepare> {
+                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+  const GCNTargetMachine *TM;
+  const SISubtarget *ST;
   DivergenceAnalysis *DA;
-  const TargetMachine *TM;
+  Module *Mod;
+  bool HasUnsafeFPMath;
 
 public:
   static char ID;
   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
     FunctionPass(ID),
-    TM(TM) { }
+    TM(static_cast<const GCNTargetMachine *>(TM)),
+    ST(nullptr),
+    DA(nullptr),
+    Mod(nullptr),
+    HasUnsafeFPMath(false) { }
+
+  bool visitFDiv(BinaryOperator &I);
+
+  bool visitInstruction(Instruction &I) {
+    return false;
+  }
 
   bool doInitialization(Module &M) override;
   bool runOnFunction(Function &F) override;
@@ -55,7 +70,92 @@ public:
 
 } // End anonymous namespace
 
+static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
+  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
+  if (!CNum)
+    return false;
+
+  // Reciprocal f32 is handled separately without denormals.
+  return UnsafeDiv || CNum->isExactlyValue(+1.0);
+}
+
+// Insert an intrinsic for fast fdiv for safe math situations where we can
+// reduce precision. Leave fdiv for situations where the generic node is
+// expected to be optimized.
+bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+  Type *Ty = FDiv.getType();
+
+  // TODO: Handle half
+  if (!Ty->getScalarType()->isFloatTy())
+    return false;
+
+  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
+  if (!FPMath)
+    return false;
+
+  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+  float ULP = FPOp->getFPAccuracy();
+  if (ULP < 2.5f)
+    return false;
+
+  FastMathFlags FMF = FPOp->getFastMathFlags();
+  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
+                                      FMF.allowReciprocal();
+  if (ST->hasFP32Denormals() && !UnsafeDiv)
+    return false;
+
+  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+  Builder.setFastMathFlags(FMF);
+  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
+
+  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
+  Function *Decl
+    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+
+  Value *Num = FDiv.getOperand(0);
+  Value *Den = FDiv.getOperand(1);
+
+  Value *NewFDiv = nullptr;
+
+  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    NewFDiv = UndefValue::get(VT);
+
+    // FIXME: Doesn't do the right thing for cases where the vector is partially
+    // constant. This works when the scalarizer pass is run first.
+    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
+      Value *NumEltI = Builder.CreateExtractElement(Num, I);
+      Value *DenEltI = Builder.CreateExtractElement(Den, I);
+      Value *NewElt;
+
+      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
+        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+      } else {
+        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
+      }
+
+      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
+    }
+  } else {
+    if (!shouldKeepFDivF32(Num, UnsafeDiv))
+      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+  }
+
+  if (NewFDiv) {
+    FDiv.replaceAllUsesWith(NewFDiv);
+    NewFDiv->takeName(&FDiv);
+    FDiv.eraseFromParent();
+  }
+
+  return true;
+}
+
+static bool hasUnsafeFPMath(const Function &F) {
+  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
+  return Attr.getValueAsString() == "true";
+}
+
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
+  Mod = &M;
   return false;
 }
 
@@ -63,10 +163,21 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   if (!TM || skipFunction(F))
     return false;
 
+  ST = &TM->getSubtarget<SISubtarget>(F);
   DA = &getAnalysis<DivergenceAnalysis>();
-  visit(F);
+  HasUnsafeFPMath = hasUnsafeFPMath(F);
 
-  return true;
+  bool MadeChange = false;
+
+  for (BasicBlock &BB : F) {
+    BasicBlock::iterator Next;
+    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+      Next = std::next(I);
+      MadeChange |= visit(*I);
+    }
+  }
+
+  return MadeChange;
 }
 
 INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
@@ -77,6 +188,6 @@ INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
 
 char AMDGPUCodeGenPrepare::ID = 0;
 
-FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const TargetMachine *TM) {
+FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
   return new AMDGPUCodeGenPrepare(TM);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 791872a9db4..8e3471bd208 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -29,16 +29,39 @@ static const char *const IntrinsicNameTable[] = {
 #undef
GET_INTRINSIC_NAME_TABLE
 };
 
-std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
-                                         unsigned numTys) const {
-  if (IntrID < Intrinsic::num_intrinsics) {
-    return nullptr;
-  }
+namespace {
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+}
+
+StringRef AMDGPUIntrinsicInfo::getName(unsigned IntrID,
+                                       ArrayRef<Type *> Tys) const {
+  if (IntrID < Intrinsic::num_intrinsics)
+    return StringRef();
+
   assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics &&
          "Invalid intrinsic ID");
 
-  std::string Result(IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]);
-  return Result;
+  return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
+                                         unsigned NumTys) const {
+  return getName(IntrID, makeArrayRef(Tys, NumTys)).str();
+}
+
+FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
+                                           ArrayRef<Type *> Tys) const {
+  // FIXME: Re-use Intrinsic::getType machinery
+  switch (ID) {
+  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
+    Type *F32Ty = Type::getFloatTy(Context);
+    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
+  }
+  default:
+    llvm_unreachable("unhandled intrinsic");
+  }
 }
 
 unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -68,8 +91,20 @@ bool AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
 #undef GET_INTRINSIC_OVERLOAD_TABLE
 }
 
+Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                              ArrayRef<Type *> Tys) const {
+  FunctionType *FTy = getType(M->getContext(), IntrID, Tys);
+  Function *F
+    = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
+
+  AttributeSet AS = getAttributes(M->getContext(),
+                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
+  F->setAttributes(AS);
+  return F;
+}
+
 Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
                                               Type **Tys,
-                                              unsigned numTys) const {
-  llvm_unreachable("Not implemented");
+                                              unsigned NumTys) const {
+  return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys));
 }
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
index f4173929259..6cb8b964464 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.h
@@ -34,13 +34,23 @@ enum ID {
 class AMDGPUIntrinsicInfo final : public TargetIntrinsicInfo {
 public:
   AMDGPUIntrinsicInfo();
+
+  StringRef getName(unsigned IntrId, ArrayRef<Type *> Tys = None) const;
+
   std::string getName(unsigned IntrId, Type **Tys = nullptr,
-                      unsigned numTys = 0) const override;
+                      unsigned NumTys = 0) const override;
+
   unsigned lookupName(const char *Name, unsigned Len) const override;
   bool isOverloaded(unsigned IID) const override;
   Function *getDeclaration(Module *M, unsigned ID,
                            Type **Tys = nullptr,
-                           unsigned numTys = 0) const override;
+                           unsigned NumTys = 0) const override;
+
+  Function *getDeclaration(Module *M, unsigned ID,
+                           ArrayRef<Type *> = None) const;
+
+  FunctionType *getType(LLVMContext &Context, unsigned ID,
+                        ArrayRef<Type *> Tys = None) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 3e53f52c689..b2d4e1144c7 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -309,6 +309,7 @@ public:
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override;
 
+  void addIRPasses() override;
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
   bool addInstSelector() override;
@@ -499,6 +500,13 @@ void
GCNPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); } +void GCNPassConfig::addIRPasses() { + // TODO: May want to move later or split into an early and late one. + addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine())); + + AMDGPUPassConfig::addIRPasses(); +} + bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index aa1d009b396..80d44351267 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1792,6 +1792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL, Op->getVTList(), Ops, VT, MMO); } + case AMDGPUIntrinsic::amdgcn_fdiv_fast: { + return lowerFDIV_FAST(Op, DAG); + } case AMDGPUIntrinsic::SI_vs_load_input: return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT, Op.getOperand(1), @@ -2098,7 +2101,8 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Catch division cases where we can use shortcuts with rcp and rsq // instructions. -SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { +SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, + SelectionDAG &DAG) const { SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -2139,47 +2143,48 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { - if (SDValue FastLowered = LowerFastFDIV(Op, DAG)) - return FastLowered; - +// Faster 2.5 ULP division that does not support denormals. +SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); - // faster 2.5 ulp fdiv when using -amdgpu-fast-fdiv flag - if (EnableAMDGPUFastFDIV) { - // This does not support denormals. - SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); + SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS); - const APFloat K0Val(BitsToFloat(0x6f800000)); - const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); + const APFloat K0Val(BitsToFloat(0x6f800000)); + const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32); - const APFloat K1Val(BitsToFloat(0x2f800000)); - const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); + const APFloat K1Val(BitsToFloat(0x2f800000)); + const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32); - const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); - EVT SetCCVT = - getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); + EVT SetCCVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32); - SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); + SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT); - SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); - // TODO: Should this propagate fast-math-flags? + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); - r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); + // rcp does not support denormals. 
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); - // rcp does not support denormals. - SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); + SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); - SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0); + return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); +} - return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul); - } +SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { + if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) + return FastLowered; + + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); - // Generates more precise fpdiv32. const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32); SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); @@ -2209,7 +2214,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { if (DAG.getTarget().Options.UnsafeFPMath) - return LowerFastFDIV(Op, DAG); + return lowerFastUnsafeFDIV(Op, DAG); SDLoc SL(Op); SDValue X = Op.getOperand(0); diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 8e055eea58c..1d349faa592 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -36,7 +36,8 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index a9b7c39096e..9d06ccfc6c7 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -7,7 +7,8 @@ // //===----------------------------------------------------------------------===// // -// SI Intrinsic Definitions +// Backend internal SI Intrinsic Definitions. User code should not +// directly use these. // //===----------------------------------------------------------------------===// @@ -177,6 +178,12 @@ let TargetPrefix = "SI", isTarget = 1 in { } // End TargetPrefix = "SI", isTarget = 1 let TargetPrefix = "amdgcn", isTarget = 1 in { + // Emit 2.5 ulp, no denormal division. Should only be inserted by + // pass based on !fpmath metadata. 
+  def int_amdgcn_fdiv_fast : Intrinsic<
+    [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+  >;
+
   /* Control flow Intrinsics */
 
   def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
index a12132f425d..d78c75165be 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll
@@ -1,8 +1,246 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck %s
-; RUN: opt -S -amdgpu-codegenprepare < %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare %s | FileCheck %s
+; RUN: opt -S -amdgpu-codegenprepare %s | FileCheck -check-prefix=NOOP %s
 ; Make sure this doesn't crash with no triple
 
-; CHECK-LABEL: @foo(
-define void @foo() {
+; NOOP-LABEL: @noop_fdiv_fpmath(
+; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
+define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
 }
+
+; CHECK-LABEL: @fdiv_fpmath(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
+; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath(
+; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
+; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0
+; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}}
+; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
+; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
+; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
+define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+  %no.md = fdiv float 1.0, %x
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float 1.0, %x, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float 1.0, %x, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp float 1.0, %x
+  store volatile float %arcp.no.md, float addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
+  store volatile float %arcp.25ulp, float
addrspace(1)* %out
+
+  %fast.no.md = fdiv fast float 1.0, %x
+  store volatile float %fast.no.md, float addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast float 1.0, %x, !fpmath !0
+  store volatile float %fast.25ulp, float addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+
+; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
+; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
+; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
+; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
+; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
+; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
+define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+  %no.md = fdiv <2 x float> %a, %b
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> %a, %b, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %md.1ulp = fdiv <2 x float> %a, %b, !fpmath !2
+  store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
+
+  %md.25ulp = fdiv <2 x float> %a, %b, !fpmath !0
+  store volatile <2 x float> %md.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %md.half.ulp = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x, !fpmath !1
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 1.000000e+00>, %x{{$}}
+
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: extractelement <2 x float> %x
+; CHECK: fdiv arcp float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
+  store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_nonsplat(
+; CHECK: %no.md = fdiv <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %arcp.no.md = fdiv arcp <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x
+; CHECK: %fast.no.md = fdiv fast <2 x float> <float 1.000000e+00, float 2.000000e+00>, %x{{$}}
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv arcp float
1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0
+; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+  %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+
+  %fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
+  store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
+
+  %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; FIXME: Should be able to get fdiv for 1.0 component
+; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %arcp.25ulp
+
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
+; CHECK: store volatile <2 x float> %fast.25ulp
+define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+  %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
+
+  %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+
+  %fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
+  store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
+
+  ret void
+}
+
+; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
+; CHECK: %no.md = fdiv float %a, %b{{$}}
+; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
+; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
+; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
+; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
+; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
+define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+  %no.md = fdiv float %a, %b
+  store volatile float %no.md, float addrspace(1)* %out
+
+  %md.half.ulp = fdiv float %a, %b, !fpmath !1
+  store volatile float %md.half.ulp, float addrspace(1)* %out
+
+  %md.1ulp = fdiv float %a, %b, !fpmath !2
+  store volatile float %md.1ulp, float addrspace(1)* %out
+
+  %md.25ulp = fdiv float %a, %b, !fpmath !0
+  store volatile float %md.25ulp, float addrspace(1)* %out
+
+  %md.3ulp = fdiv float %a, %b, !fpmath !3
+  store volatile float %md.3ulp, float addrspace(1)* %out
+
+  %fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
+  store volatile float %fast.md.25ulp, float addrspace(1)* %out
+
+  %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+  store
volatile float %arcp.md.25ulp, float addrspace(1)* %out + + ret void +} + +attributes #0 = { nounwind optnone noinline } +attributes #1 = { nounwind } +attributes #2 = { nounwind "target-features"="+fp32-denormals" } + +; CHECK: !0 = !{float 2.500000e+00} +; CHECK: !1 = !{float 5.000000e-01} +; CHECK: !2 = !{float 1.000000e+00} +; CHECK: !3 = !{float 3.000000e+00} + +!0 = !{float 2.500000e+00} +!1 = !{float 5.000000e-01} +!2 = !{float 1.000000e+00} +!3 = !{float 3.000000e+00} diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll index 4021233e778..65464cdba60 100644 --- a/test/CodeGen/AMDGPU/fdiv.ll +++ b/test/CodeGen/AMDGPU/fdiv.ll @@ -1,8 +1,4 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -amdgpu-fast-fdiv < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=I754 -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=UNSAFE-FP -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s ; These tests check that fdiv is expanded correctly and also test that the @@ -15,22 +11,59 @@ ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 +; SI: v_div_scale_f32 +; SI-DAG: v_div_scale_f32 ; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_mul_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b + store float %fdiv, float addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}fdiv_25ulp_f32: +; SI: v_cndmask_b32 +; SI: v_mul_f32 +; SI: v_rcp_f32 +; SI: v_mul_f32 +; SI: v_mul_f32 +define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} + +; Use correct fdiv +; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32: +; SI: v_fma_f32 +; SI: v_div_fmas_f32 +; SI: v_div_fixup_f32 +define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +entry: + %fdiv = fdiv float %a, %b, !fpmath !0 + store float %fdiv, float addrspace(1)* %out + ret void +} -; I754-DAG: v_div_scale_f32 -; I754-DAG: v_rcp_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_mul_f32 -; I754-DAG: v_fma_f32 -; I754-DAG: v_div_fixup_f32 -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) { +; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32: +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: - %0 = fdiv float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -38,15 +71,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, 
KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv fast float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv fast float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -54,15 +86,14 @@ entry: ; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) { +; SI: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}} +; SI: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] +; SI-NOT: [[RESULT]] +; SI: buffer_store_dword [[RESULT]] +define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: - %0 = fdiv arcp float %a, %b - store float %0, float addrspace(1)* %out + %fdiv = fdiv arcp float %a, %b + store float %fdiv, float addrspace(1)* %out ret void } @@ -72,26 +103,24 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +; SI: v_div_scale_f32 +define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +entry: + %fdiv = fdiv <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out + ret void +} -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: +; SI: v_cmp_gt_f32 +; SI: v_cmp_gt_f32 +define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -101,19 +130,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv fast <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv fast <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -123,19 +145,12 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS ; R600-DAG: MUL_IEEE {{\** 
*}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_rcp_f32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: - %0 = fdiv arcp <2 x float> %a, %b - store <2 x float> %0, <2 x float> addrspace(1)* %out + %fdiv = fdiv arcp <2 x float> %a, %b + store <2 x float> %fdiv, <2 x float> addrspace(1)* %out ret void } @@ -149,37 +164,11 @@ entry: ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 - -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_scale_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -; I754: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +; SI: v_div_fixup_f32 +define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -198,24 +187,11 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -234,24 +210,11 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS ; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_rcp_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: v_mul_f32_e32 -; UNSAFE-FP: 
v_mul_f32_e32 - -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -; SI-DAG: v_rcp_f32 -; SI-DAG: v_mul_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +; SI: v_rcp_f32 +define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -259,3 +222,9 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } + +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" } +attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" } +attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll new file mode 100644 index 00000000000..54d7848da3b --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +declare float @llvm.amdgcn.fdiv.fast(float, float) #0 + +; CHECK-LABEL: {{^}}test_fdiv_fast: +; CHECK: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc +; CHECK: v_mul_f32_e32 +; CHECK: v_rcp_f32_e32 +; CHECK: v_mul_f32_e32 +; CHECK: v_mul_f32_e32 +define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { + %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) + store float %fdiv, float addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll index b1d42206254..27a88f7b59e 100644 --- a/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,11 +1,96 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen only ever does unsafe fp math. 
; FUNC-LABEL: {{^}}rcp_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv fast float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv arcp float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fabs_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]| +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %rcp = fdiv float 1.0, %src.fabs + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: fneg folded into constant 1 +; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32: +define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %src.fabs.fneg = fsub float -0.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs.fneg + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + + +declare float @llvm.fabs.f32(float) #1 + +attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00} diff --git a/test/CodeGen/AMDGPU/reciprocal.ll b/test/CodeGen/AMDGPU/reciprocal.ll deleted file mode 100644 index f9292a78852..00000000000 --- a/test/CodeGen/AMDGPU/reciprocal.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define amdgpu_ps void @test(<4 x float> inreg %reg0) { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void 
@llvm.r600.store.swizzle(<4 x float>, i32, i32)
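
Editor's note on the lowering: the sketch below restates, in plain scalar C, the instruction sequence lowerFDIV_FAST() emits for llvm.amdgcn.fdiv.fast (the v_cndmask/v_mul/v_rcp/v_mul/v_mul pattern that the fdiv_25ulp_f32 and test_fdiv_fast checks above expect). It is an illustration only, not code from the patch: the function name fdiv_fast_sketch is invented here, and v_rcp_f32 is modeled as an exact 1.0f/x, whereas the hardware instruction is an approximate reciprocal; that approximation and the lack of denormal handling are what the 2.5 ulp !fpmath budget pays for.

#include <math.h>

/* Scalar model of the 2.5 ulp fast-division lowering (no denormals).
   When |b| is very large, its reciprocal could land in the denormal
   range and be flushed to zero, so the denominator is pre-scaled by
   2^-32 and the quotient is re-scaled by the same factor. */
float fdiv_fast_sketch(float a, float b) {
  const float K0 = 0x1.0p+96f;                /* bit pattern 0x6f800000 */
  const float K1 = 0x1.0p-32f;                /* bit pattern 0x2f800000 */
  float scale = (fabsf(b) > K0) ? K1 : 1.0f;  /* v_cndmask_b32 */
  float rcp = 1.0f / (b * scale);             /* v_mul_f32 + v_rcp_f32 */
  return scale * (a * rcp);                   /* two v_mul_f32 */
}

The select/pre-scale pair is why the 25ulp codegen tests check a v_cndmask_b32 and three multiplies around the single v_rcp_f32, while the rcp-pattern tests (fdiv of exactly 1.0) collapse to a bare v_rcp_f32.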