AMDGPU: Default to using scalar mov to materialize immediate

author Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp

index 905ee4673191981cd833ba28a3223d19b2fcac05..108995a463f14e51ef81b9865b0cffb784d60d55 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1943,6 +1943,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
      return false;
    }
  
+  if (MI.isInlineAsm()) {
+    // Verify register classes for inlineasm constraints.
+    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+         I != E; ++I) {
+      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+      if (!RC)
+        continue;
+
+      const MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg())
+        continue;
+
+      unsigned Reg = Op.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+        ErrInfo = "inlineasm operand has incorrect register class.";
+        return false;
+      }
+    }
+
+    return true;
+  }
+
    // Make sure the register classes are correct.
    for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isFPImm()) {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td

index a56d909624f1ab5dba0659c5c95fe55cdbba0bec..f19e99e7cd116d2152991b7af5cf2e551f837e4b 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -265,19 +265,25 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
    return isInlineImmediate(N);
  }]>;
  
-class SGPRImm <dag frag> : PatLeaf<frag, [{
+class VGPRImm <dag frag> : PatLeaf<frag, [{
    if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
      return false;
    }
    const SIRegisterInfo *SIRI =
        static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  unsigned Limit = 0;
    for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-                                                U != E; ++U) {
+         Limit < 10 && U != E; ++U, ++Limit) {
      const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-    if (RC && SIRI->isSGPRClass(RC))
-      return true;
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
    }
-  return false;
+
+  return Limit < 10;
  }]>;
  
  //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index f3ccee288fd84055a3806922a5cda498360cc759..4122eb915f39324e810b232b392a8be822a07212 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -615,23 +615,23 @@ def : Pat <
  /********** ================== **********/
  
  def : Pat <
-  (SGPRImm<(i32 imm)>:$imm),
-  (S_MOV_B32 imm:$imm)
+  (VGPRImm<(i32 imm)>:$imm),
+  (V_MOV_B32_e32 imm:$imm)
  >;
  
  def : Pat <
-  (SGPRImm<(f32 fpimm)>:$imm),
-  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+  (VGPRImm<(f32 fpimm)>:$imm),
+  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
  >;
  
  def : Pat <
    (i32 imm:$imm),
-  (V_MOV_B32_e32 imm:$imm)
+  (S_MOV_B32 imm:$imm)
  >;
  
  def : Pat <
    (f32 fpimm:$imm),
-  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
  >;
  
  def : Pat <
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll

index ad5e89083401ce951891085dfd90642e9499483a..27b3cc0e435e42b44539d1edc7a2bd25ad0b1e55 100644 (file)
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -15,7 +15,7 @@
  ; GCN: s_mov_b32 m0, -1
  ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
  
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
  ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
  ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
  ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
@@ -93,24 +93,24 @@ endif:
  ; GCN: s_mov_b32 m0, -1
  ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
  
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
  
  ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
  ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
  ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
  
+; Spill load
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+
  ; Spill saved exec
  ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
  ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
  
  
  ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:12 ; 8-byte Folded Spill
  ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
-
-; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
  
  ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
  
@@ -120,7 +120,7 @@ endif:
  
  
  ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
  ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
  ; GCN: v_cmp_ne_u32_e32 vcc,
  ; GCN: s_and_b64 vcc, exec, vcc
@@ -133,11 +133,11 @@ endif:
  ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
  ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
  
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:12 ; 8-byte Folded Reload
  ; VMEM: s_waitcnt vmcnt(0)
  ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
  
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
  ; VMEM: s_waitcnt vmcnt(0)
  ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
  
diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll

index 7282e89e85be0e2fb1b0bc2fd1d3729e18843aad..3c0bb75a607499e54ac1061cf3515161eb80fb2a 100644 (file)
--- a/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -26,8 +26,43 @@ entry:
  ; GCN: s_mov_b32 m0, -1
  ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0
  ; GCN: ; use m0
-define void @inline_sreg_constraint_m0(i32 addrspace(1)* %ptr) {
+define void @inline_sreg_constraint_m0() {
    %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
    tail call void asm sideeffect "; use $0", "s"(i32 %m0)
    ret void
  }
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_i32() {
+  tail call void asm sideeffect "; use $0", "s"(i32 32)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_f32() {
+  tail call void asm sideeffect "; use $0", "s"(float 1.0)
+  ret void
+}
+
+; FIXME: Should be able to use s_mov_b64
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_i64() {
+  tail call void asm sideeffect "; use $0", "s"(i64 -4)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_f64() {
+  tail call void asm sideeffect "; use $0", "s"(double 1.0)
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll

index 0cdb1c9fb3a67e888a911d82d9e8b67e760d3d88..37da9c5d5adb3eab6506b2d88244104ea72e6cf1 100644 (file)
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -15,7 +15,7 @@
  ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
  ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
+; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
  ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
  ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
  define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Tue, 1 Nov 2016 22:55:07 +0000 (22:55 +0000)
lib/Target/AMDGPU/SIInstrInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/control-flow-fastregalloc.ll		patch \| blob \| history
test/CodeGen/AMDGPU/inline-constraints.ll		patch \| blob \| history
test/CodeGen/AMDGPU/insert_vector_elt.ll		patch \| blob \| history