[AMDGPU] gfx10 v_fmac_f16 operand folding

author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)

committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>

Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)
author Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)
committer Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp

index e8f7fdb57fb9c829682c9a812e54535ff4857180..b2edb362b809feb28ddbe6e3dd95616f6d35cc56 100644 (file)
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
    switch (Opc) {
    case AMDGPU::V_MAC_F32_e64:
    case AMDGPU::V_MAC_F16_e64:
-  case AMDGPU::V_FMAC_F32_e64: {
+  case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMAC_F16_e64: {
      // Special case for mac. Since this is replaced with mad when folded into
      // src2, we need to check the legality for the final instruction.
      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      if (static_cast<int>(OpNo) == Src2Idx) {
-      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F16_e64;
+      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F32_e64;
  
        unsigned Opc = IsFMA ?
-        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
        const MCInstrDesc &MadDesc = TII->get(Opc);
        return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
      }
@@ -314,12 +318,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
      // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
      unsigned Opc = MI->getOpcode();
      if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-         Opc == AMDGPU::V_FMAC_F32_e64) &&
+         Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
          (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
-      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F16_e64;
+      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F32_e64;
        unsigned NewOpc = IsFMA ?
-        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
  
        // Check if changing this to a v_mad_{f16, f32} instruction will allow us
        // to fold the operand.
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll

index d18705f1dd410eedf179ed9d90120ca7c79c4b46..d8a4a0b841d017fb0884ae76369b8f496312fc68 100644 (file)
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -126,7 +126,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, f
  ; GFX8_10:      v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
  ; VI-FLUSH:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
  ; VI-DENORM:    v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-; GFX10-DENORM: v_fmac_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; GFX10-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
  ; GFX10-FLUSH:  v_sub_f16_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
  define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
    %x = bitcast i16 %x.arg to half
@@ -152,7 +152,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i
  ; VI-FLUSH-DAG:     v_mac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
  ; VI-DENORM-DAG:    v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, v{{[0-9]+}}
  ; GFX10-FLUSH-DAG:  v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
-; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], [[X]], 2.0
+; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], [[X]], 2.0, s{{[0-9]+}}
  
  ; GCN-DAG: buffer_store_short [[MUL2]]
  ; GCN-DAG: buffer_store_short [[MAD]]
@@ -174,7 +174,7 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i
  ; VI-FLUSH-DAG:     v_mad_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
  ; VI-DENORM-DAG:    v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, v{{[0-9]+}}
  ; GFX10-FLUSH-DAG:  v_add_f16_e32 [[MAD:v[0-9]+]], s{{[0-9]+}}, [[MUL2]]
-; GFX10-DENORM-DAG: v_fmac_f16_e64 [[MAD:v[0-9]+]], |[[X]]|, 2.0
+; GFX10-DENORM-DAG: v_fma_f16 [[MAD:v[0-9]+]], |[[X]]|, 2.0, s{{[0-9]+}}
  
  ; GCN-DAG: buffer_store_short [[MUL2]]
  ; GCN-DAG: buffer_store_short [[MAD]]
@@ -201,8 +201,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i
  ; GFX10-FLUSH:  v_add_f16_e64 [[MUL2:v[0-9]+]], |[[X:s[0-9]+]]|, |{{s[0-9]+}}|
  ; GFX10-FLUSH:  v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
  ; GFX10-FLUSH:  v_add_f16_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[MUL2]]
-; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0
-; GFX10-DENORM: v_fmac_f16_e64 {{v[0-9]+}}, |[[X]]|, 2.0
+; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, s{{[0-9]+}}
+; GFX10-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, s{{[0-9]+}}
  
  define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
    %x = bitcast i16 %x.arg to half
author	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)
committer	Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
	Wed, 25 Sep 2019 18:40:20 +0000 (18:40 +0000)
lib/Target/AMDGPU/SIFoldOperands.cpp		patch | blob | history
test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll		patch | blob | history