const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
+ const SIMachineFunctionInfo *MFI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
}
}
+// TODO: Add a heuristic for when the frame index may not fit in the addressing
+// mode immediate offset, to avoid materializing it in loops.
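+// Returns true if OpToFold is a frame index used as the vaddr operand of a
+// MUBUF or scratch FLAT instruction, where it can potentially be folded into
+// the addressing mode.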
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ int OpNo,
+ const MachineOperand &OpToFold) {
+ return OpToFold.isFI() &&
+ (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+ OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
return;
}
+ if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+ // Sanity check that this is a stack access.
+ // FIXME: Should probably use stack pseudos before frame lowering.
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+ SOff->getReg() != MFI->getStackPtrOffsetReg()))
+ return;
+
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- bool FoldingImm = OpToFold.isImm();
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold it into the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+ SOff->setReg(MFI->getStackPtrOffsetReg());
+ return;
+ }
- if (FoldingImm && UseMI->isCopy()) {
+ bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+ if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
// %sgpr = V_READFIRSTLANE_B32 %vgpr
// =>
// %sgpr = S_MOV_B32 imm
- if (FoldingImm) {
+ if (FoldingImmLike) {
if (execMayBeModifiedBeforeUse(*MRI,
UseMI->getOperand(UseOpIdx).getReg(),
*OpToFold.getParent(),
// FIXME: ChangeToImmediate should clear subreg
UseMI->getOperand(1).setSubReg(0);
- UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ if (OpToFold.isImm())
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ else
+ UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
return;
}
return;
}
- if (!FoldingImm) {
+ if (!FoldingImmLike) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
}
+ // FIXME: Make this more precise
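+ // (isSegmentSpecificFLAT also matches global_* instructions, not just
+ // scratch_*.)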
+ static bool isFLATScratch(const MachineInstr &MI) {
+ return isSegmentSpecificFLAT(MI);
+ }
+
// Any FLAT encoded instruction, including global_* and scratch_*.
bool isFLAT(uint16_t Opcode) const {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
ret void
}
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block:
+; GCN-NOT: v_lshrrev_b32
+; GCN-NOT: s_sub_u32
+
+; GCN: s_and_saveexec_b64
+; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]]
+
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
+; GCN-NOT: s32
+
+; GCN: [[BB1]]
+; GCN: s_or_b64 exec, exec
+define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 {
+entry:
+ br i1 %cond, label %bb0, label %bb1
+
+bb0:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
+ %add3 = add nsw i32 %tmp1, 2
+ store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ br label %bb1
+
+bb1:
+ ret void
+}
+
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0xc00{{$}}
--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s
+
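+# The frame index should not be folded when the rsrc is not the scratch rsrc
+# or the soffset is not the stack pointer / scratch wave offset register.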
+---
+name: no_fold_fi_non_stack_rsrc_soffset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ scratchWaveOffsetReg: '$sgpr6'
+ frameOffsetReg: '$sgpr6'
+ stackPtrOffsetReg: '$sgpr6'
+body: |
+ bb.0:
+ liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+ ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset
+ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+ ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+ %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ %1:sreg_32_xm0 = S_MOV_B32 0
+ %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %3
+ SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+---
+name: no_fold_fi_non_stack_rsrc
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ scratchWaveOffsetReg: '$sgpr6'
+ frameOffsetReg: '$sgpr6'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+
+ ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc
+ ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
+ ; GCN: SI_RETURN_TO_EPILOG $vgpr0
+ %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
+ %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %3
+ SI_RETURN_TO_EPILOG $vgpr0
+
+...
+
+# Offset is from the global scratch wave offset.
+---
+name: fold_fi_mubuf_scratch_scratch_wave_offset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ scratchWaveOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+ ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GCN: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+ BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+ %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %2
+ S_ENDPGM 0, implicit $vgpr0
+
+...
+
+---
+name: no_fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+ maxAlignment: 4
+ localFrameSize: 4
+stack:
+ - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ scratchWaveOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
+ ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+ ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+ ; GCN: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+ %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+ BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = COPY %2
+ S_ENDPGM 0, implicit $vgpr0
+
+...
ret void
}
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
; GCN: s_and_saveexec_b64
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %bb, label %ret
ret float %val
}
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+
+; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
+; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
+; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
+define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
+ %alloca = alloca i32, addrspace(5)
+ %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+ %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
+ ret float %ret.val
+}
+
+; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
+; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}}
+; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
+; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]]
+define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
+ %alloca = alloca i32, addrspace(5)
+ %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32
+
+ %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
+ ret float %ret.val
+}
+
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
; Make sure this doesn't crash.
; CHECK-LABEL: {{^}}test_readfirstlane_fi:
-; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4
-; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]]
+; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4
define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 {
%alloca = alloca i32, addrspace(5)
%int = ptrtoint i32 addrspace(5)* %alloca to i32