R600/SI: Commute instructions to enable more folding opportunities

author Tom Stellard <thomas.stellard@amd.com>

Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)

committer Tom Stellard <thomas.stellard@amd.com>

Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)
author Tom Stellard <thomas.stellard@amd.com>
Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)
committer Tom Stellard <thomas.stellard@amd.com>
Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)
diff --git a/lib/Target/R600/SIFoldOperands.cpp b/lib/Target/R600/SIFoldOperands.cpp

index 545905ba64e1c2d3b305c23e7bd307fcda25cd35..655b3aaaa2b939f04e0ce67f8da8902ff3a04219 100644 (file)
--- a/lib/Target/R600/SIFoldOperands.cpp
+++ b/lib/Target/R600/SIFoldOperands.cpp
@@ -56,10 +56,16 @@ struct FoldCandidate {
    uint64_t ImmToFold;
  
    FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp) :
-      UseMI(MI), UseOpNo(OpNo), OpToFold(FoldOp), ImmToFold(0) { }
-
-  FoldCandidate(MachineInstr *MI, unsigned OpNo, uint64_t Imm) :
-      UseMI(MI), UseOpNo(OpNo), OpToFold(nullptr), ImmToFold(Imm) { }
+                UseMI(MI), UseOpNo(OpNo) {
+
+    if (FoldOp->isImm()) { // Immediate: keep only its value, not the operand.
+      OpToFold = nullptr; // A null OpToFold is what isImm() tests for.
+      ImmToFold = FoldOp->getImm();
+    } else {
+      assert(FoldOp->isReg()); // Only immediates and registers are accepted.
+      OpToFold = FoldOp; // NOTE(review): ImmToFold is not assigned on this path.
+    }
+  }
  
    bool isImm() const {
      return !OpToFold;
@@ -119,6 +125,35 @@ static bool updateOperand(FoldCandidate &Fold,
    return false;
  }
  
+static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
+                             MachineInstr *MI, unsigned OpNo,
+                             MachineOperand *OpToFold,
+                             const SIInstrInfo *TII) { // True iff a candidate was queued.
+  if (!TII->isOperandLegal(MI, OpNo, OpToFold)) {
+    // OpToFold is not legal at OpNo as MI stands, so try commuting MI and
+    // re-check legality at the position the use occupies after the commute.
+    unsigned CommuteIdx0;
+    unsigned CommuteIdx1;
+    bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1);
+
+    if (CanCommute) { // Follow the use operand to its post-commute index.
+      if (CommuteIdx0 == OpNo)
+        OpNo = CommuteIdx1;
+      else if (CommuteIdx1 == OpNo)
+        OpNo = CommuteIdx0;
+    } // OpNo stays unchanged when it matches neither commuted index.
+
+    if (!CanCommute || !TII->commuteInstruction(MI)) // Commute unavailable or failed.
+      return false;
+
+    if (!TII->isOperandLegal(MI, OpNo, OpToFold)) // Still illegal after commuting.
+      return false; // NOTE(review): MI is not commuted back here -- confirm intended.
+  }
+
+  FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold)); // Queue the legal fold.
+  return true;
+}
+
  bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    const SIInstrInfo *TII =
@@ -140,6 +175,11 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
        MachineOperand &OpToFold = MI.getOperand(1);
        bool FoldingImm = OpToFold.isImm() || OpToFold.isFPImm();
  
+      // FIXME: We could also be folding things like FrameIndexes and
+      // TargetIndexes.
+      if (!FoldingImm && !OpToFold.isReg())
+        continue;
+
        // Folding immediates with more than one use will increase program size.
        // FIXME: This will also reduce register usage, which may be better
        // in some cases.  A better heuristic is needed.
@@ -210,24 +250,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
              UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1)
            continue;
  
-
          if (FoldingImm) {
-          const MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-          if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &ImmOp)) {
-            FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(),
-                               Imm.getSExtValue()));
-          }
-          continue;
-        }
-
-        // Normal substitution with registers
-        if (TII->isOperandLegal(UseMI, Use.getOperandNo(), &OpToFold)) {
-          FoldList.push_back(FoldCandidate(UseMI, Use.getOperandNo(), &OpToFold));
+          MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+          tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII);
            continue;
          }
  
-        // FIXME: We could commute the instruction to create more opportunites
-        // for folding.  This will only be useful if we have 32-bit instructions.
+        tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII);
  
          // FIXME: We could try to change the instruction from 64-bit to 32-bit
          // to enable more folding opportunities.  The shrink operands pass
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp

index 37e64e930651e1d917ab9580dc34a6c4fb774d08..743d1c65815df9a9e7f651d887d984397d1c571e 100644 (file)
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -709,6 +709,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  
  MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                                bool NewMI) const {
+
    if (MI->getNumOperands() < 3)
      return nullptr;
  
@@ -730,8 +731,9 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
    // Make sure it's legal to commute operands for VOP2.
    if (isVOP2(MI->getOpcode()) &&
        (!isOperandLegal(MI, Src0Idx, &Src1) ||
-       !isOperandLegal(MI, Src1Idx, &Src0)))
+       !isOperandLegal(MI, Src1Idx, &Src0))) {
      return nullptr;
+    }
  
    if (!Src1.isReg()) {
      // Allow commuting instructions with Imm or FPImm operands.
@@ -1471,6 +1473,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
      //
      // s_sendmsg 0, s0 ; Operand defined as m0reg
      //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
+
      return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
    }
  
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll

index 146c92f8905be691adcb8ce9c23983dd4b440bb6..865971712306671dffa346a64565ad4f02c5192d 100644 (file)
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,7 +1,7 @@
  ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
  
  ;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
  ;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
  
  define void @test(i32 %p) {
diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll

index 8d4208cb585c61798e069a17b4df636146e3c6aa..c635c0569e84bab44b64b00960fcbb4216d340d7 100644 (file)
--- a/test/CodeGen/R600/sdiv.ll
+++ b/test/CodeGen/R600/sdiv.ll
@@ -35,7 +35,7 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
  ; SI: buffer_load_dword [[VAL:v[0-9]+]],
  ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
-; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
  ; SI: v_add_i32
  ; SI: v_lshrrev_b32
  ; SI: v_ashrrev_i32
diff --git a/test/CodeGen/R600/use-sgpr-multiple-times.ll b/test/CodeGen/R600/use-sgpr-multiple-times.ll

index 2c6ae1eae67ba3cf08d5b131fd1e3a4c03a62b5e..97d73ba74bc5a384a4462e40ed663c6112f3ef1e 100644 (file)
--- a/test/CodeGen/R600/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/R600/use-sgpr-multiple-times.ll
@@ -41,7 +41,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
  ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
  ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
  ; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
  ; SI: buffer_store_dword [[RESULT]]
  define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
    %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
@@ -53,7 +53,7 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
  ; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
  ; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
  ; SI: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
  ; SI: buffer_store_dword [[RESULT]]
  define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
    %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
author	Tom Stellard <thomas.stellard@amd.com>
	Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)
committer	Tom Stellard <thomas.stellard@amd.com>
	Wed, 7 Jan 2015 22:44:19 +0000 (22:44 +0000)
lib/Target/R600/SIFoldOperands.cpp		patch \| blob \| history
lib/Target/R600/SIInstrInfo.cpp		patch \| blob \| history
test/CodeGen/R600/mulhu.ll		patch \| blob \| history
test/CodeGen/R600/sdiv.ll		patch \| blob \| history
test/CodeGen/R600/use-sgpr-multiple-times.ll		patch \| blob \| history