unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::V_MAX_F32_e64:
- case AMDGPU::V_MAX_F16_e64: {
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F64: {
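+ // At the MI level the clamp pattern is v_max_* dst, src, src with the clamp
+ // bit set (see the CHECK lines in the tests below); if the clamp bit is
+ // clear there is nothing to fold.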
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
}
- // No med3 for f16, but clamp is possible.
- if (VT == MVT::f16)
+ // No med3 for f16 or f64, but clamp is still possible.
+ // TODO: gfx9 has med3 f16
+ if (VT == MVT::f16 || VT == MVT::f64)
return SDValue();
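For f32, performFPMed3ImmCombine can fold minnum(maxnum(x, K0), K1) into a
single v_med3_f32; f16 (before gfx9) and f64 have no med3, so only the exact
[0.0, 1.0] clamp case, matched earlier in this function (outside this hunk),
survives for them. A minimal IR sketch of the med3-eligible f32 shape, with an
illustrative function name:

define float @fold_to_med3(float %x) {
  %lo = call float @llvm.maxnum.f32(float %x, float 2.0)
  %hi = call float @llvm.minnum.f32(float %lo, float 4.0)
  ret float %hi
}

declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)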
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
// Only do this if the inner op has one use since this will just increase
// register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
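+ // There is no v_min3/v_max3/v_med3 for f64, so skip the three-operand
+ // combines for it; the clamp combine below still handles f64.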
+ if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+ VT != MVT::f64) {
// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
(Opc == AMDGPUISD::FMIN_LEGACY &&
Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
- (N->getValueType(0) == MVT::f32 ||
- (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
+ (VT == MVT::f32 || VT == MVT::f64 ||
+ (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
Op0.hasOneUse()) {
if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
return Res;
case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {
if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
- N->getValueType(0) != MVT::f64 &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);
break;
i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
>;
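+// Select the IR-level AMDGPUclamp (clamp to [0.0, 1.0]) node to the VOP3 max
+// instruction with the source repeated in both operands and the clamp bit set.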
-// TODO: Does f64 support clamp?
def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
def : ClampPat<V_MAX_F16_e64, f16>;
/********** ================================ **********/
ret void
}
+; GCN-LABEL: {{^}}v_clamp_add_src_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+ %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+ %a = load double, double addrspace(1)* %gep0
+ %add = fadd double %a, 1.0
+ %max = call double @llvm.maxnum.f64(double %add, double 0.0)
+ %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
+ store double %clamp, double addrspace(1)* %out.gep
+ ret void
+}
+
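As a usage note, the FileCheck patterns above are driven by the file's RUN
lines (outside this hunk); an invocation along these lines exercises the GCN
prefix, though the exact flags in the actual file may differ:

; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s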
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare float @llvm.floor.f32(float) #1
-; FIXME: Do f64 instructions support clamp?
; GCN-LABEL: {{^}}v_clamp_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
; GCN-LABEL: {{^}}v_clamp_neg_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
; GCN-LABEL: {{^}}v_clamp_negabs_f64:
; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
-; GCN: v_max_f64
-; GCN: v_min_f64
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid