AMDGPU: Redefine clamp node as clamp 0.0-1.0

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td

index 0de72734b71b985796f1c4b5bcf424fc7534d98c..17d6b460a80aeae263c556aa7a7eb81bcbf4383c 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -257,6 +257,12 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
    [FeatureFP64FP16Denormals]
  >;
  
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+  "DX10Clamp",
+  "true",
+  "clamp modifier clamps NaNs to 0.0"
+>;
+
  def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
    "FPExceptions",
    "true",
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

index c3ff2e322546151a19d2ba86461ce3448aefd51f..577509d31ce94073252ffa6d803da737c222bd3e 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -586,7 +586,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
    ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
  
    // Make clamp modifier on NaN input returns 0.
-  ProgInfo.DX10Clamp = 1;
+  ProgInfo.DX10Clamp = STM.enableDX10Clamp();
  
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    ProgInfo.ScratchSize = FrameInfo.getStackSize();
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 12c17c3e9eb565b9e0fc1dc218c0de81c2861b31..9e3ac8b871e6340c8e24e0b941d598804a739d94 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1012,22 +1012,29 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
    EVT VT = Op.getValueType();
  
    switch (IntrinsicID) {
-    default: return Op;
-    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
-      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  default: return Op;
+  case AMDGPUIntrinsic::AMDGPU_clamp: {
+    // Deprecated in favor of emitting min/max combo or fmed3.
+    ConstantFPSDNode *CSrc1 = dyn_cast<ConstantFPSDNode>(Op.getOperand(2));
+    ConstantFPSDNode *CSrc2 = dyn_cast<ConstantFPSDNode>(Op.getOperand(3));
+    if (CSrc1 && CSrc2 && CSrc1->isZero() && CSrc2->isExactlyValue(1.0))
+      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1));
  
-    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
-      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2),
-                         Op.getOperand(3));
+    SDValue Max = DAG.getNode(ISD::FMAXNUM, DL, VT, Op.getOperand(1),
+                              Op.getOperand(2));
+    return DAG.getNode(ISD::FMINNUM, DL, VT, Max, Op.getOperand(3));
+  }
+  case AMDGPUIntrinsic::AMDGPU_bfe_i32:
+    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                       Op.getOperand(1),
+                       Op.getOperand(2),
+                       Op.getOperand(3));
  
-    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
-      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2),
-                         Op.getOperand(3));
+  case AMDGPUIntrinsic::AMDGPU_bfe_u32:
+    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                       Op.getOperand(1),
+                       Op.getOperand(2),
+                       Op.getOperand(3));
    }
  }
  
@@ -2445,6 +2452,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                        SN->getBasePtr(), SN->getMemOperand());
  }
  
+SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CSrc)
+    return SDValue();
+
+  const APFloat &F = CSrc->getValueAPF();
+  APFloat Zero = APFloat::getZero(F.getSemantics());
+  APFloat::cmpResult Cmp0 = F.compare(Zero);
+  if (Cmp0 == APFloat::cmpLessThan ||
+      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+  }
+
+  APFloat One(F.getSemantics(), "1.0");
+  APFloat::cmpResult Cmp1 = F.compare(One);
+  if (Cmp1 == APFloat::cmpGreaterThan)
+    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+  return SDValue(CSrc, 0);
+}
+
  /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
  /// binary operation \p Opc to it with the corresponding constant operands.
  SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -3323,6 +3352,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
      return performLoadCombine(N, DCI);
    case ISD::STORE:
      return performStoreCombine(N, DCI);
+  case AMDGPUISD::CLAMP:
+    return performClampCombine(N, DCI);
    }
    return SDValue();
  }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h

index 58ac09f2698a1bfdb95e2f2f7c61210d1365f1fa..fb487f914ad92497791cbc2c2ef1da2ac8bd9787 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -70,6 +70,7 @@ protected:
    bool shouldCombineMemoryType(EVT VT) const;
    SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  
    SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
                                         unsigned Opc, SDValue LHS,
@@ -238,7 +239,11 @@ enum NodeType : unsigned {
    RETURN,
    DWORDADDR,
    FRACT,
+
+  /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
+  /// modifier behavior with dx10_enable.
    CLAMP,
+
    // This is SETCC with the full mask result which is used for a compare with a
    // result bit per item in the wavefront.
    SETCC,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td

index 91341ef88ea695c9ce9bf2d86709fbd0f3bb44a0..a081c30ef815d676978d3d5ddef8fe046045225b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -92,7 +92,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
    [SDNPCommutative, SDNPAssociative]
  >;
  
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
  
  // out = min(a, b) a and b are floats, where a nan comparison fails.
  def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td

index 7d2a52bccc8a4faa987bf32a3094e8a67597ef91..e76891c3bed09ea2bc320522bf272bb4ef30118b 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -452,7 +452,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
    (outs rc:$dst),
    (ins rc:$src0),
    "CLAMP $dst, $src0",
-  [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+  [(set f32:$dst, (AMDGPUclamp f32:$src0))]
  >;
  
  class FABS <RegisterClass rc> : AMDGPUShaderInst <
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

index 32b04fe7a1e162417e8b1f41df39d20595d68662..a719931eff0b3443ff23f2767cfbd4e199a4fac7 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -42,7 +42,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
    // for SI has the unhelpful behavior that it unsets everything else if you
    // disable it.
  
-  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
+  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
    if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
      FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
  
@@ -89,6 +89,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
      FP32Denormals(false),
      FP64FP16Denormals(false),
      FPExceptions(false),
+    DX10Clamp(false),
      FlatForGlobal(false),
      UnalignedScratchAccess(false),
      UnalignedBufferAccess(false),
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h

index 433c295feaf88bf5480595b0bc27d4b232692bdb..8f1aaa1d8939c0a4b8755cc6285425b469c3a7db 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -103,6 +103,7 @@ protected:
    bool FP32Denormals;
    bool FP64FP16Denormals;
    bool FPExceptions;
+  bool DX10Clamp;
    bool FlatForGlobal;
    bool UnalignedScratchAccess;
    bool UnalignedBufferAccess;
@@ -294,10 +295,6 @@ public:
      return DumpCode;
    }
  
-  bool enableIEEEBit(const MachineFunction &MF) const {
-    return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
-  }
-
    /// Return the amount of LDS that can be used that will not restrict the
    /// occupancy lower than WaveCount.
    unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
@@ -323,6 +320,14 @@ public:
      return FPExceptions;
    }
  
+  bool enableDX10Clamp() const {
+    return DX10Clamp;
+  }
+
+  bool enableIEEEBit(const MachineFunction &MF) const {
+    return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+  }
+
    bool useFlatForGlobal() const {
      return FlatForGlobal;
    }
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 60acf4fb1c011824a7cca72b60efd3575570e02a..40a28203000603189087a2f2527f71f48bd93c57 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3995,8 +3995,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
    return DAG.isKnownNeverNaN(Op);
  }
  
-static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
-                                       SDValue Op0, SDValue Op1) {
+SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
+                                                  const SDLoc &SL,
+                                                  SDValue Op0,
+                                                  SDValue Op1) const {
    ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
    if (!K1)
      return SDValue();
@@ -4010,6 +4012,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
    if (Cmp == APFloat::cmpGreaterThan)
      return SDValue();
  
+  // TODO: Check IEEE bit enabled?
+  EVT VT = K0->getValueType(0);
+  if (Subtarget->enableDX10Clamp()) {
+    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
+    // hardware fmed3 behavior converting to a min.
+    // FIXME: Should this be allowing -0.0?
+    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
+      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
+  }
+
+  // No med3 for f16, but clamp is possible.
+  if (VT == MVT::f16)
+    return SDValue();
+
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
    // give the other result, which is different from med3 with a NaN input.
@@ -4074,7 +4090,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
         (Opc == AMDGPUISD::FMIN_LEGACY &&
          Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
-      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+      (N->getValueType(0) == MVT::f32 ||
+       (N->getValueType(0) == MVT::f16 && Subtarget->has16BitInsts())) &&
+      Op0.hasOneUse()) {
      if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
        return Res;
    }
@@ -4082,6 +4100,60 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
    return SDValue();
  }
  
+static bool isClampZeroToOne(SDValue A, SDValue B) {
+  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
+    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
+      // FIXME: Should this be allowing -0.0?
+      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
+             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
+    }
+  }
+
+  return false;
+}
+
+// FIXME: Should only worry about snans for version with chain.
+SDValue SITargetLowering::performFMed3Combine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
+  // NaNs. With a NaN input, the order of the operands may change the result.
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue Src2 = N->getOperand(2);
+
+  if (isClampZeroToOne(Src0, Src1)) {
+    // const_a, const_b, x -> clamp is safe in all cases including signaling
+    // nans.
+    // FIXME: Should this be allowing -0.0?
+    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
+  }
+
+  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
+  // handling no dx10-clamp?
+  if (Subtarget->enableDX10Clamp()) {
+    // If NaNs is clamped to 0, we are free to reorder the inputs.
+
+    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+      std::swap(Src0, Src1);
+
+    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
+      std::swap(Src1, Src2);
+
+    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+      std::swap(Src0, Src1);
+
+    if (isClampZeroToOne(Src1, Src2))
+      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
+  }
+
+  return SDValue();
+}
+
  unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                            const SDNode *N0,
                                            const SDNode *N1) const {
@@ -4348,6 +4420,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
    case AMDGPUISD::CVT_F32_UBYTE2:
    case AMDGPUISD::CVT_F32_UBYTE3:
      return performCvtF32UByteNCombine(N, DCI);
+  case AMDGPUISD::FMED3:
+    return performFMed3Combine(N, DCI);
    }
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  }
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h

index 037b6f730c5bf8cb9029c13b111923ef1ccd661a..fe86f8e2b65587ec27249b48ad803e2aa39ee815 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -84,7 +84,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
    SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  
+  SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                  SDValue Op0, SDValue Op1) const;
    SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
  
    unsigned getFusedOpcode(const SelectionDAG &DAG,
                            const SDNode *N0, const SDNode *N1) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td

index 94580543afde18016e1afbcb4330120d3578979d..aa6d677f466c6dfc1d16dc65cfe799a1423de7ff 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -625,6 +625,7 @@ def SRCMODS {
  
  def DSTCLAMP {
    int NONE = 0;
+  int ENABLE = 1;
  }
  
  def DSTOMOD {
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index 1d09880147d8a20c585727d27a911b1890b21812..4bd759012a395b90eb8401b1a7748cbe8ec2bff5 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -647,12 +647,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
  /********** Src & Dst modifiers **********/
  /********** =================== **********/
  
-def : Pat <
-  (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
-               (f32 FP_ZERO), (f32 FP_ONE)),
-  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
+
+// If denormals are not enabled, it only impacts the compare of the
+// inputs. The output result is not flushed.
+class ClampPat<Instruction inst, ValueType vt> : Pat <
+  (vt (AMDGPUclamp
+        (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))),
+  (inst i32:$src0_modifiers, vt:$src0,
+        i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
  >;
  
+// TODO: Does f64 support clamp?
+def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F16_e64, f16>;
+
  /********** ================================ **********/
  /********** Floating point absolute/negative **********/
  /********** ================================ **********/
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll

new file mode 100644 (file)

index 0000000..5f9884e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -0,0 +1,535 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_clamp_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %fneg.a = fsub float -0.0, %a
+  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %fabs.a = call float @llvm.fabs.f32(float %a)
+  %fneg.fabs.a = fsub float -0.0, %fabs.a
+
+  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negzero_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
+define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  store volatile float %max, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32
+define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+
+; FIXME: Better to fold neg into max
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
+; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
+; SI: v_cvt_f16_f32
+define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %fneg.a = fsub half -0.0, %a
+  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+
+; FIXME: Better to fold neg/abs into max
+
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]|
+; SI: v_max_f32_e64 v{{[0-9]+}}, [[CVT]], [[CVT]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32
+define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %fabs.a = call half @llvm.fabs.f16(half %a)
+  %fneg.fabs.a = fsub half -0.0, %fabs.a
+
+  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: Do f64 instructions support clamp?
+; GCN-LABEL: {{^}}v_clamp_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64
+; GCN: v_min_f64
+define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64
+; GCN: v_min_f64
+define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %fneg.a = fsub double -0.0, %a
+  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64
+; GCN: v_min_f64
+define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %fabs.a = call double @llvm.fabs.f64(double %a)
+  %fneg.fabs.a = fsub double -0.0, %fabs.a
+
+  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32
+define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
+define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
+define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; ---------------------------------------------------------------------
+; Test non-default behaviors enabling snans and disabling dx10_clamp
+; ---------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add  = fadd nnan float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
+define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
+define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
+define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
+define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
+define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
+attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
+attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll

index 977667cbe6c3148046bee7d2aac711420e66856a..ecb825a2f20c1765f16f55b3fe06695db82239da 100644 (file)
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -60,9 +60,20 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %
    ret void
  }
  
+; GCN-LABEL: {{^}}test_no_dx10_clamp_vi:
+; GCN: float_mode = 192
+; GCN: enable_dx10_clamp = 0
+; GCN: enable_ieee_mode = 1
+define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+  store float 0.0, float addrspace(1)* %out0
+  store double 0.0, double addrspace(1)* %out1
+  ret void
+}
+
  attributes #0 = { nounwind "target-cpu"="kaveri" }
  attributes #1 = { nounwind "target-cpu"="fiji" }
  attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" }
  attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
  attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
  attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll

index 2336109f4dadf1ea9a2783845c42b8083b4dba93..1925a0b6fe4a08290134770e0f949ba375cda913 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -7,7 +7,7 @@ declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
  
  ; FUNC-LABEL: {{^}}clamp_0_1_f32:
  ; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}}
+; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], [[ARG]] clamp{{$}}
  ; SI: buffer_store_dword [[RESULT]]
  ; SI: s_endpgm
  
@@ -20,7 +20,7 @@ define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
  
  ; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
  ; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}}
+; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, |[[ARG]]| clamp{{$}}
  ; SI: buffer_store_dword [[RESULT]]
  ; SI: s_endpgm
  define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@@ -32,7 +32,7 @@ define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
  
  ; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
  ; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}}
+; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], -[[ARG]] clamp{{$}}
  ; SI: buffer_store_dword [[RESULT]]
  ; SI: s_endpgm
  define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
@@ -44,7 +44,7 @@ define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
  
  ; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
  ; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}}
+; SI: v_max_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, -|[[ARG]]| clamp{{$}}
  ; SI: buffer_store_dword [[RESULT]]
  ; SI: s_endpgm
  define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 21 Feb 2017 23:35:48 +0000 (23:35 +0000)
lib/Target/AMDGPU/AMDGPU.td		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUISelLowering.h		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUInstructions.td		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUSubtarget.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUSubtarget.h		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.h		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/clamp.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AMDGPU/hsa-fp-mode.ll		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll		patch \| blob \| history