AMDGPU: Skip fneg/select combine if the fneg can fold into its source operation

author Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

index 7771b23e43a463ea8ca6a9bf19250d8d5cd017b8..e48c1943cb01641ce2900e0d55b587706586fc5a 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -484,6 +484,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
  // Target Information
  //===----------------------------------------------------------------------===//
  
+static bool fnegFoldsIntoOp(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FMA:
+  case ISD::FMAD:
+  case ISD::FSIN:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RCP_LEGACY:
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::FMUL_LEGACY:
+    return true;
+  default:
+    return false;
+  }
+}
+
  MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
    return MVT::i32;
  }
@@ -2738,20 +2756,31 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
      SDValue NewLHS = LHS.getOperand(0);
      SDValue NewRHS = RHS;
  
-    // TODO: Skip for operations where other combines can absord the fneg.
+    // Careful: if the neg can be folded up, don't try to pull it back down.
+    bool ShouldFoldNeg = true;
  
-    if (LHS.getOpcode() == ISD::FNEG)
-      NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
-    else if (CRHS->isNegative())
-      return SDValue();
+    if (NewLHS.hasOneUse()) {
+      unsigned Opc = NewLHS.getOpcode();
+      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+        ShouldFoldNeg = false;
+      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
+        ShouldFoldNeg = false;
+    }
  
-    if (Inv)
-      std::swap(NewLHS, NewRHS);
+    if (ShouldFoldNeg) {
+      if (LHS.getOpcode() == ISD::FNEG)
+        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+      else if (CRHS->isNegative())
+        return SDValue();
  
-    SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
-                                    Cond, NewLHS, NewRHS);
-    DCI.AddToWorklist(NewSelect.getNode());
-    return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+      if (Inv)
+        std::swap(NewLHS, NewRHS);
+
+      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
+                                      Cond, NewLHS, NewRHS);
+      DCI.AddToWorklist(NewSelect.getNode());
+      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
+    }
    }
  
    return SDValue();
@@ -2806,24 +2835,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
    return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
  }
  
-static bool fnegFoldsIntoOp(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FADD:
-  case ISD::FSUB:
-  case ISD::FMUL:
-  case ISD::FMA:
-  case ISD::FMAD:
-  case ISD::FSIN:
-  case AMDGPUISD::RCP:
-  case AMDGPUISD::RCP_LEGACY:
-  case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMUL_LEGACY:
-    return true;
-  default:
-    return false;
-  }
-}
-
  SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
    SelectionDAG &DAG = DCI.DAG;
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll

new file mode 100644 (file)

index 0000000..559d464
--- /dev/null
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -0,0 +1,46 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_legacy_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
+  %x = load volatile float, float addrspace(1)* undef
+  %y = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %rcp = call float @llvm.amdgcn.rcp.legacy(float %x)
+  %fneg = fsub float -0.0, %rcp
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_legacy_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
+  %x = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
+  %fneg = fsub float -0.0, %mul
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll

index a260a00e7e39fdbd8106c22b2ae37918ab0d2475..d9d311cd032b1f1d0f38663a15494693ae6e248f 100644 (file)
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -721,7 +721,120 @@ define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
    ret void
  }
  
+; --------------------------------------------------------------------------------
+; Don't fold if fneg can fold into the source
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_add_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Y:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %y = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %add = fadd float %x, 4.0
+  %fneg = fsub float -0.0, %add
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_sub_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %add = fsub float %x, 4.0
+  %fneg = fsub float -0.0, %add
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_mul_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %mul = fmul float %x, 4.0
+  %fneg = fsub float -0.0, %mul
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fma_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %z = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %fma = call float @llvm.fma.f32(float %x, float 4.0, float %z)
+  %fneg = fsub float -0.0, %fma
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_fneg_posk_src_fmad_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+; GCN: buffer_load_dword [[Z:v[0-9]+]]
+
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
+; GCN-NEXT: buffer_store_dword [[SELECT]]
+define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %z = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %fmad = call float @llvm.fmuladd.f32(float %x, float 4.0, float %z)
+  %fneg = fsub float -0.0, %fmad
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
+; FIXME: This one should fold to rcp
+; GCN-LABEL: {{^}}select_fneg_posk_src_rcp_f32:
+; GCN: buffer_load_dword [[X:v[0-9]+]]
+
+; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[X]]
+; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
+; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
+; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
+define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+  %x = load volatile float, float addrspace(1)* undef
+  %y = load volatile float, float addrspace(1)* undef
+  %cmp = icmp eq i32 %c, 0
+  %rcp = call float @llvm.amdgcn.rcp.f32(float %x)
+  %fneg = fsub float -0.0, %rcp
+  %select = select i1 %cmp, float %fneg, float 2.0
+  store volatile float %select, float addrspace(1)* undef
+  ret void
+}
+
  declare float @llvm.fabs.f32(float) #1
+declare float @llvm.fma.f32(float, float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare float @llvm.amdgcn.rcp.f32(float) #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+declare float @llvm.amdgcn.fmul.legacy(float, float) #1
  
  attributes #0 = { nounwind }
  attributes #1 = { nounwind readnone }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 12 Jan 2017 18:58:15 +0000 (18:58 +0000)
lib/Target/AMDGPU/AMDGPUISelLowering.cpp		patch | blob | history
test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll	[new file with mode: 0644]	patch | blob
test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll		patch | blob | history