From: Matt Arsenault
Date: Mon, 24 Jun 2019 14:53:56 +0000 (+0000)
Subject: AMDGPU: Fold frame index into MUBUF
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=68420a23e066a2c6a641d77ab0d8e43897beb982;p=llvm

AMDGPU: Fold frame index into MUBUF

This matters for byval uses outside of the entry block, which appear
as copies.

Previously, the only folding done was during selection, which could
not see the underlying frame index. For any uses outside the entry
block, the frame index was materialized in the entry block relative
to the global scratch wave offset.

This may produce worse code in cases where the offset ends up not
fitting in the MUBUF offset field. A better heuristic would be
helpful for extreme frames.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@364185 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3f566884f6b..78e6e39b05a 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -87,10 +87,11 @@ public:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
 
@@ -159,6 +160,17 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   }
 }
 
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
@@ -290,7 +302,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -403,7 +414,7 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -453,10 +464,28 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;
 
-  bool FoldingImm = OpToFold.isImm();
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
 
-  if (FoldingImm && UseMI->isCopy()) {
+  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -517,7 +546,7 @@ void SIFoldOperands::foldOperand(
     // %sgpr = V_READFIRSTLANE_B32 %vgpr
     // =>
     // %sgpr = S_MOV_B32 imm
-    if (FoldingImm) {
+    if (FoldingImmLike) {
       if (execMayBeModifiedBeforeUse(*MRI,
                                      UseMI->getOperand(UseOpIdx).getReg(),
                                      *OpToFold.getParent(),
@@ -528,7 +557,10 @@ void SIFoldOperands::foldOperand(
 
       // FIXME: ChangeToImmediate should clear subreg
       UseMI->getOperand(1).setSubReg(0);
-      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      if (OpToFold.isImm())
+        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      else
+        UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
       UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
       return;
     }
@@ -560,7 +592,7 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 
     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -904,6 +936,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -1170,8 +1205,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 64eb60b4690..ad15173294b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -496,6 +496,11 @@ public:
     return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
   }
 
+  // FIXME: Make this more precise
+  static bool isFLATScratch(const MachineInstr &MI) {
+    return isSegmentSpecificFLAT(MI);
+  }
+
   // Any FLAT encoded instruction, including global_* and scratch_*.
   bool isFLAT(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::FLAT;
diff --git a/test/CodeGen/AMDGPU/byval-frame-setup.ll b/test/CodeGen/AMDGPU/byval-frame-setup.ll
index 671c8257a74..60694ba1f2f 100644
--- a/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -27,6 +27,47 @@ entry:
   ret void
 }
 
+; Make sure the offset is folded and the function's frame register is used
+; rather than the global scratch wave offset.
+; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block: +; GCN-NOT: v_lshrrev_b32 +; GCN-NOT: s_sub_u32 + +; GCN: s_and_saveexec_b64 +; GCN: s_cbranch_execz [[BB1:BB[0-9]+_[0-9]+]] + +; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} +; GCN-NOT: s32 + +; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} +; GCN-NOT: s32 +; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} +; GCN-NOT: s32 + +; GCN: [[BB1]] +; GCN: s_or_b64 exec, exec +define hidden void @void_func_byval_struct_use_outside_entry_block(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1, i1 %cond) #1 { +entry: + br i1 %cond, label %bb0, label %bb1 + +bb0: + %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 + %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 + %add = add nsw i32 %tmp, 1 + store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 + %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 + %add3 = add nsw i32 %tmp1, 2 + store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 + store volatile i32 9, i32 addrspace(1)* null, align 4 + br label %bb1 + +bb1: + ret void +} + ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: ; GCN: s_mov_b32 s5, s32 ; GCN: s_add_u32 s32, s32, 0xc00{{$}} diff --git a/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir new file mode 100644 index 00000000000..a015a1ef4d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -0,0 +1,134 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: no_fold_fi_non_stack_rsrc_soffset +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + scratchWaveOffsetReg: '$sgpr6' + frameOffsetReg: '$sgpr6' + stackPtrOffsetReg: '$sgpr6' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %1:sreg_32_xm0 = S_MOV_B32 0 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+ +--- +name: no_fold_fi_non_stack_rsrc +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + scratchWaveOffsetReg: '$sgpr6' + frameOffsetReg: '$sgpr6' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... + +# Offset is from global scratch wave offset. +--- +name: fold_fi_mubuf_scratch_scratch_wave_offset +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + scratchWaveOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %2 + S_ENDPGM 0, implicit $vgpr0 + +... 
+
+---
+name: no_fold_fi_mubuf_scratch_sp_offset
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment: 4
+  localFrameSize: 4
+stack:
+  - { id: 0, size: 4, alignment: 4, local-offset: 0 }
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+  scratchWaveOffsetReg: '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
+    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ; GCN: S_ENDPGM 0, implicit $vgpr0
+    %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
+
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = COPY %2
+    S_ENDPGM 0, implicit $vgpr0
+
+...
diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll
index bf49988f00c..917b3ca91ec 100644
--- a/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -144,9 +144,6 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
   ret void
 }
 
-; FIXME: Should be able to see that this can use vaddr, but the
-; FrameIndex is hidden behind a CopyFromReg in the second block.
-
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
 ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33
@@ -156,13 +153,13 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b
 
 ; GCN: s_and_saveexec_b64
 
-; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
-; CI: buffer_load_dword v1, v1, s[0:3], s33 offen offset:4{{$}}
+; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]]
+; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
-; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s33 offen offset:4{{$}}
+; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]]
+; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}}
 
-; GCN: ds_write_b32
+; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]]
 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 {
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %bb, label %ret
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
index 5ccc708245e..561037e4b24 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -440,6 +440,32 @@ main_body:
   ret float %val
 }
 
+; Make sure frame index folding doesn't crash on a MUBUF not used
+; for stack access.
+ +; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset: +; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} +; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen +define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) { + %alloca = alloca i32, addrspace(5) + %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 + + %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false) + ret float %ret.val +} + +; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset: +; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 4{{$}} +; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s +; CHECK: buffer_load_dword v0, v{{\[}}[[FI]]:[[HI]] +define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) { + %alloca = alloca i32, addrspace(5) + %alloca.cast = ptrtoint i32 addrspace(5)* %alloca to i32 + + %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false) + ret float %ret.val +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index d03f97a0697..f28f40c3213 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -60,8 +60,7 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr(i32 addrspace(1)* % ; Make sure this doesn't crash. ; CHECK-LABEL: {{^}}test_readfirstlane_fi: -; CHECK: v_mov_b32_e32 [[FIVAL:v[0-9]]], 4 -; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[FIVAL]] +; CHECK: s_mov_b32 [[FIVAL:s[0-9]]], 4 define amdgpu_kernel void @test_readfirstlane_fi(i32 addrspace(1)* %out) #1 { %alloca = alloca i32, addrspace(5) %int = ptrtoint i32 addrspace(5)* %alloca to i32