From cd39b42cab26b48aa4d512c9e5a8ce2f1fd21052 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Wed, 22 Feb 2017 23:53:37 +0000
Subject: [PATCH] AMDGPU: Use clamp with f64

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295908 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIFoldOperands.cpp  |  3 ++-
 lib/Target/AMDGPU/SIISelLowering.cpp  | 13 ++++++++-----
 lib/Target/AMDGPU/SIInstructions.td   |  2 +-
 test/CodeGen/AMDGPU/clamp-modifier.ll | 15 +++++++++++++++
 test/CodeGen/AMDGPU/clamp.ll          |  9 +++------
 5 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f223f57bfa3..e98182562d8 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -694,7 +694,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
   unsigned Op = MI.getOpcode();
   switch (Op) {
   case AMDGPU::V_MAX_F32_e64:
-  case AMDGPU::V_MAX_F16_e64: {
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F64: {
     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
       return nullptr;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ca42c13a909..bf4d3fb9b4f 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4055,7 +4055,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
   }
 
   // No med3 for f16, but clamp is possible.
-  if (VT == MVT::f16)
+  // TODO: gfx9 has med3 f16
+  if (VT == MVT::f16 || VT == MVT::f64)
     return SDValue();
 
   // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
@@ -4073,6 +4074,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
   unsigned Opc = N->getOpcode();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
@@ -4080,7 +4082,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
+  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+      VT != MVT::f64) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -4122,8 +4126,8 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
-      (N->getValueType(0) == MVT::f32 ||
-       (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
+      (VT == MVT::f32 || VT == MVT::f64 ||
+       (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
       Op0.hasOneUse()) {
     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
       return Res;
@@ -4404,7 +4408,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
-        N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 5d147b24ec9..1a40bd72ad3 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -657,8 +657,8 @@ class ClampPat<Instruction inst, ValueType vt> : Pat <
         i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
 >;
 
-// TODO: Does f64 support clamp?
 def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
 def : ClampPat<V_MAX_F16_e64, f16>;
 
 /********** ================================ **********/
diff --git a/test/CodeGen/AMDGPU/clamp-modifier.ll b/test/CodeGen/AMDGPU/clamp-modifier.ll
index f5c53c64db1..186bd349ecc 100644
--- a/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -153,6 +153,21 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out,
   ret void
 }
 
+; GCN-LABEL: {{^}}v_clamp_add_src_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %add = fadd double %a, 1.0
+  %max = call double @llvm.maxnum.f64(double %add, double 0.0)
+  %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
+  store double %clamp, double addrspace(1)* %out.gep
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fabs.f32(float) #1
 declare float @llvm.floor.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
index a0075066f68..6a78290f9a8 100644
--- a/test/CodeGen/AMDGPU/clamp.ll
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -147,8 +147,7 @@ define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addr
 ; FIXME: Do f64 instructions support clamp?
 ; GCN-LABEL: {{^}}v_clamp_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
 define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@@ -163,8 +162,7 @@ define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspa
 ; GCN-LABEL: {{^}}v_clamp_neg_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
 define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
@@ -180,8 +178,7 @@ define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double add
 ; GCN-LABEL: {{^}}v_clamp_negabs_f64:
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
 define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
-- 
2.50.1