From 6b43815b282eb747b6da31f56291033d867a9589 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Mon, 29 Jul 2019 16:40:58 +0000 Subject: [PATCH] AMDGPU/LoadStoreOptimizer: combine MMOs when merging instructions Summary: The LoadStoreOptimizer was creating instructions with 2 MachineMemOperands, which meant they were assumed to alias with all other instructions, because MachineInstr:mayAlias() returns true when an instruction has multiple MachineMemOperands. This was preventing these instructions from being merged again, and was giving the scheduler less freedom to reorder them. Reviewers: arsenm, nhaehnle Reviewed By: arsenm Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65036 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@367237 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 41 +++++++++++++++++-- test/CodeGen/AMDGPU/merge-load-store.mir | 23 +++++++++++ .../CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir | 4 +- 3 files changed, 63 insertions(+), 5 deletions(-) diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index ae8b967893a..5a988dd3ae4 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -307,6 +307,16 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, return true; } +// This function assumes that \p A and \p B have are identical except for +// size and offset, and they referecne adjacent memory. +static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, + const MachineMemOperand *A, + const MachineMemOperand *B) { + unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); + unsigned Size = A->getSize() + B->getSize(); + return MF.getMachineMemOperand(A, MinOffset, Size); +} + bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) { // XXX - Would the same offset be OK? Is there any reason this would happen or // be useful? @@ -858,12 +868,20 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { unsigned DestReg = MRI->createVirtualRegister(SuperRC); unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) .addImm(MergedOffset) // offset .addImm(CI.GLC0) // glc .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -909,6 +927,14 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(MergedOffset) // offset @@ -916,7 +942,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { .addImm(CI.SLC0) // slc .addImm(0) // tfe .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1092,6 +1118,15 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { if (Regs & VADDR) MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); + + // It shouldn't be possible to get this far if the two instructions + // don't have a single memoperand, because MachineInstr::mayAlias() + // will return true if this is the case. + assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand()); + + const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); + const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) .addImm(std::min(CI.Offset0, CI.Offset1)) // offset @@ -1099,7 +1134,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { .addImm(CI.SLC0) // slc .addImm(0) // tfe .addImm(CI.DLC0) // dlc - .cloneMergedMemRefs({&*CI.I, &*CI.Paired}); + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); diff --git a/test/CodeGen/AMDGPU/merge-load-store.mir b/test/CodeGen/AMDGPU/merge-load-store.mir index f716cb15f0e..18b4b04858b 100644 --- a/test/CodeGen/AMDGPU/merge-load-store.mir +++ b/test/CodeGen/AMDGPU/merge-load-store.mir @@ -64,6 +64,8 @@ } attributes #0 = { convergent nounwind } + + define amdgpu_kernel void @merge_mmos() { ret void } ... --- name: mem_dependency @@ -163,3 +165,24 @@ body: | %18:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %9, 3, 0, 0 :: (dereferenceable invariant load 4) S_ENDPGM 0 ... +--- +# CHECK-LABEL: merge_mmos +# CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4) +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) +name: merge_mmos +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4) + %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4) + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + S_ENDPGM 0 + +... diff --git a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir index 574aea17319..e3bcac22f13 100644 --- a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir +++ b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir @@ -32,7 +32,7 @@ } ... -# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test1 liveins: @@ -124,7 +124,7 @@ body: | S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test4 liveins: -- 2.40.0