AMDGPU: Fix broken FrameIndex handling

author Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

index 951db65efbba1ca4ad7dfee3b3f46f9165e8339d..bf3e1da515633b9716c2ad03957a9588372e2d4a 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -51,10 +51,10 @@ public:
    bool runOnMachineFunction(MachineFunction &MF) override;
    void Select(SDNode *N) override;
    const char *getPassName() const override;
-  void PreprocessISelDAG() override;
    void PostprocessISelDAG() override;
  
  private:
+  SDValue foldFrameIndex(SDValue N) const;
    bool isInlineImmediate(const SDNode *N) const;
    bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                     const R600InstrInfo *TII);
@@ -902,6 +902,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
    return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
  }
  
+/// Convert a generic frame index into its target form.
+///
+/// Used when choosing the vaddr operand for scratch MUBUF accesses.
+///
+/// \returns a TargetFrameIndex with the same index and value type when \p N is
+/// a FrameIndexSDNode; otherwise \p N unchanged.
+SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  if (auto *FI = dyn_cast<FrameIndexSDNode>(N))
+    return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  return N;
+}
+
  bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
                                              SDValue &VAddr, SDValue &SOffset,
                                              SDValue &ImmOffset) const {
@@ -921,14 +927,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
      // Offsets in vaddr must be positive.
      ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
      if (isLegalMUBUFImmOffset(C1)) {
-      VAddr = N0;
+      VAddr = foldFrameIndex(N0);
        ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
        return true;
      }
    }
  
    // (node)
-  VAddr = Addr;
+  VAddr = foldFrameIndex(Addr);
    ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }
@@ -1516,62 +1522,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
    return SelectVOP3Mods(In, Src, SrcMods);
  }
  
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
-
-  // Handle the perverse case where a frame index is being stored. We don't
-  // want to see multiple frame index operands on the same instruction since
-  // it complicates things and violates some assumptions about frame index
-  // lowering.
-  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
-       I != E; ++I) {
-    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
-
-    // It's possible that we have a frame index defined in the function that
-    // isn't used in this block.
-    if (FI.use_empty())
-      continue;
-
-    // Skip over the AssertZext inserted during lowering.
-    SDValue EffectiveFI = FI;
-    auto It = FI->use_begin();
-    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
-      EffectiveFI = SDValue(*It, 0);
-      It = EffectiveFI->use_begin();
-    }
-
-    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
-      SDUse &Use = It.getUse();
-      SDNode *User = Use.getUser();
-      unsigned OpIdx = It.getOperandNo();
-      ++It;
-
-      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
-        unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
-        if (OpIdx == PtrIdx)
-          continue;
-
-        unsigned OpN = M->getNumOperands();
-        SDValue NewOps[8];
-
-        assert(OpN < array_lengthof(NewOps));
-        for (unsigned Op = 0; Op != OpN; ++Op) {
-          if (Op != OpIdx) {
-            NewOps[Op] = M->getOperand(Op);
-            continue;
-          }
-
-          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                                      SDLoc(M), MVT::i32, FI);
-          NewOps[Op] = SDValue(Mov, 0);
-        }
-
-        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
-      }
-    }
-  }
-}
-
  void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
    const AMDGPUTargetLowering& Lowering =
      *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp

index 4570fe585d118b7f0cfe3de8b30930edc15a0e40..f8d4e6131b0dadad25119767768dc69cd20a002e 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -89,7 +89,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
  
    setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
    setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
  
    setOperationAction(ISD::SELECT, MVT::i1, Promote);
@@ -1558,7 +1557,6 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    switch (Op.getOpcode()) {
    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
    case ISD::BRCOND: return LowerBRCOND(Op, DAG);
    case ISD::LOAD: {
      SDValue Result = LowerLOAD(Op, DAG);
@@ -1605,43 +1603,6 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
    return nullptr;
  }
  
-SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  SDLoc SL(Op);
-  FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
-  unsigned FrameIndex = FINode->getIndex();
-
-  // A FrameIndex node represents a 32-bit offset into scratch memory. If the
-  // high bit of a frame index offset were to be set, this would mean that it
-  // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
-  // buffer, with 64 being the number of threads per wave.
-  //
-  // The maximum private allocation for the entire GPU is 4G, and we are
-  // concerned with the largest the index could ever be for an individual
-  // workitem. This will occur with the minmum dispatch size. If a program
-  // requires more, the dispatch size will be reduced.
-  //
-  // With this limit, we can mark the high bit of the FrameIndex node as known
-  // zero, which is important, because it means in most situations we can prove
-  // that values derived from FrameIndex nodes are non-negative. This enables us
-  // to take advantage of more addressing modes when accessing scratch buffers,
-  // since for scratch reads/writes, the register offset must always be
-  // positive.
-
-  uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
-
-  // XXX - It is unclear if partial dispatch works. Assume it works at half wave
-  // granularity. It is probably a full wave.
-  uint64_t MinGranularity = 32;
-
-  unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
-  EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
-
-  SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
-  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
-                     DAG.getValueType(ExtVT));
-}
-
  bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
    if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
      switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h

index 06e7da63a8fad17d3662655eec7742a008b53d11..b65f95f7854a166455e9295b6cf48f390e88b11d 100644 (file)
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -33,7 +33,6 @@ class SITargetLowering final : public AMDGPUTargetLowering {
    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
    SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td

index 37f8f17bff3afa5a4148e8d9e5f6794f112aa6ca..d04ff6a86eccb89e20f16b7830f0a5b36ee5597b 100644 (file)
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -278,6 +278,11 @@ return CurDAG->getTargetConstant(
    N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
  }]>;
  
+// Rewrite a matched frameindex node as an i32 TargetFrameIndex that carries
+// the same index.
+def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
+  auto *FI = cast<FrameIndexSDNode>(N);
+  return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
+}]>;
+
  // Copied from the AArch64 backend:
  def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
  return CurDAG->getTargetConstant(
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td

index 7daa1032f0588ac99ffc17a02df86eb6f165a4e1..86c3fd6815ea2813751073e1075784d53f61028f 100644 (file)
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -1581,6 +1581,11 @@ def : Pat <
    (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
  >;
  
+// Select a bare i32 frame index by materializing its target frame index with
+// V_MOV_B32_e32.
+def : Pat <
+ (i32 frameindex:$fi),
+ (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
+>;
+
  def : Pat <
    (i64 InlineImm<i64>:$imm),
    (S_MOV_B64 InlineImm<i64>:$imm)
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll

index 161c46b486eb94ae9cc1401f079f096eed346bc9..55b323056870197e0b075c55d21db0dac2ad00e1 100644 (file)
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,5 +1,17 @@
  ; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
  
+; Store the address of an alloca that is also referenced by lifetime markers.
+; The checks expect the frame index to be materialized with v_mov_b32 and then
+; stored.
+; GCN-LABEL: {{^}}store_fi_lifetime:
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %b = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %b)
+  store volatile i8* %b, i8* addrspace(1)* undef
+  call void @llvm.lifetime.end(i64 1, i8* %b)
+  ret void
+}
+
  ; GCN-LABEL: {{^}}stored_fi_to_lds:
  ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
  ; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
@@ -140,17 +152,18 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
  }
  
  ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
-; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc
+; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}}
  ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
  
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
  ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56
-; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
+; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]]
+; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
  
-; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]
-; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
  define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
    %tmp0 = alloca [4096 x i32]
    %tmp1 = alloca [4096 x i32]
@@ -163,4 +176,27 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
    ret void
  }
  
+@g1 = external addrspace(1) global i32*
+
+; This used to leave a dead node around, which made instruction selection fail
+; on the leftover AssertZext's ValueType operand.
+
+; GCN-LABEL: {{^}}cannot_select_assertzext_valuetype:
+; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, g1@GOTPCREL+4
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+entry:
+  %b = alloca i32, align 4
+  %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
+  %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
+  %tmp2 = load i32, i32* %arrayidx, align 4
+  store volatile i32* %b, i32* addrspace(1)* undef
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
  attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll

index eb554e2173059b584d05f7c0404506f80e18581d..105d20511b810a6b6b9aec366dfc3eea55f397a8 100644 (file)
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -7,8 +7,10 @@
  ;
  ; CHECK-LABEL: {{^}}main:
  ; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
-; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
-; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
+; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]
+
  ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
  ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
  define amdgpu_ps float @main(i32 %idx) {
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Sat, 17 Sep 2016 16:09:55 +0000 (16:09 +0000)
lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
lib/Target/AMDGPU/SIISelLowering.h		patch \| blob \| history
lib/Target/AMDGPU/SIInstrInfo.td		patch \| blob \| history
lib/Target/AMDGPU/SIInstructions.td		patch \| blob \| history
test/CodeGen/AMDGPU/captured-frame-index.ll		patch \| blob \| history
test/CodeGen/AMDGPU/local-stack-slot-bug.ll		patch \| blob \| history