/// or
/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
/// to TargetPassConfig::createMachineScheduler() to have an effect.
- virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt,
- MachineInstr &SecondLdSt,
+ virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+ MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const {
llvm_unreachable("target did not implement shouldClusterMemOps()");
}
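For context, this hook only fires if the target registers the clustering DAG mutations. A minimal sketch of that override follows; the MyTargetPassConfig name is illustrative, while the factory functions are the ones declared in llvm/CodeGen/MachineScheduler.h, and in-tree targets such as AArch64 follow the same pattern.

// Hypothetical target pass config enabling load/store clustering.
ScheduleDAGInstrs *
MyTargetPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  // Without these mutations, shouldClusterMemOps() is never queried.
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}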
std::sort(MemOpRecords.begin(), MemOpRecords.end());
unsigned ClusterLength = 1;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
- if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
- ClusterLength = 1;
- continue;
- }
-
SUnit *SUa = MemOpRecords[Idx].SU;
SUnit *SUb = MemOpRecords[Idx+1].SU;
- if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(),
+ if (TII->shouldClusterMemOps(*SUa->getInstr(), MemOpRecords[Idx].BaseReg,
+ *SUb->getInstr(), MemOpRecords[Idx+1].BaseReg,
ClusterLength) &&
DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ unsigned BaseReg1,
MachineInstr &SecondLdSt,
+ unsigned BaseReg2,
unsigned NumLoads) const {
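+ // The generic clustering mutation no longer filters on matching base
+ // registers before calling this hook, so reject mismatched bases here.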
+ if (BaseReg1 != BaseReg2)
+ return false;
+
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;
bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
int64_t &MinOffset, int64_t &MaxOffset) const;
- bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+ MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
return false;
}
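+// Returns true when the two memory operations share a base pointer: either
+// their base registers match, or both have a single memory operand in the
+// same address space whose underlying IR objects are identical.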
+static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
+ const MachineInstr &MI2, unsigned BaseReg2) {
+ if (BaseReg1 == BaseReg2)
+ return true;
+
+ if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
+ return false;
+
+ auto MO1 = *MI1.memoperands_begin();
+ auto MO2 = *MI2.memoperands_begin();
+ if (MO1->getAddrSpace() != MO2->getAddrSpace())
+ return false;
+
+ auto Base1 = MO1->getValue();
+ auto Base2 = MO2->getValue();
+ if (!Base1 || !Base2)
+ return false;
+ const MachineFunction &MF = *MI1.getParent()->getParent();
+ const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout();
+ Base1 = GetUnderlyingObject(Base1, DL);
+ Base2 = GetUnderlyingObject(Base2, DL);
+
+ if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
+ return false;
+
+ return Base1 == Base2;
+}
+
bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+ unsigned BaseReg1,
MachineInstr &SecondLdSt,
+ unsigned BaseReg2,
unsigned NumLoads) const {
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
+ return false;
+
const MachineOperand *FirstDst = nullptr;
const MachineOperand *SecondDst = nullptr;
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
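+ // Cap how many MUBUF/MTBUF/FLAT loads get clustered back to back.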
+ const unsigned MaxGlobalLoadCluster = 6;
+ if (NumLoads > MaxGlobalLoadCluster)
+ return false;
+
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
if (!FirstDst)
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+ bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
+ MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const final;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
}
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
-; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
+; SI: buffer_load_dword [[VAL0:v[0-9]+]],
+; SI: buffer_load_dword [[VAL1:v[0-9]+]],
+; VI: flat_load_dword [[VAL1:v[0-9]+]],
+; VI: flat_load_dword [[VAL0:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
-; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
+; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
}
; GCN-LABEL: {{^}}fadd_v2f16:
-; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
--- /dev/null
+# RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: flat_load_clustering
+# GCN: FLAT_LOAD_DWORD
+# GCN-NEXT: FLAT_LOAD_DWORD
+--- |
+ define amdgpu_kernel void @flat_load_clustering(i32 addrspace(1)* nocapture %arg, i32 addrspace(2)* nocapture readonly %arg1) {
+ bb:
+ %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = sext i32 %tid to i64
+ %gep1 = getelementptr inbounds i32, i32 addrspace(2)* %arg1, i64 %idxprom
+ %load1 = load i32, i32 addrspace(2)* %gep1, align 4
+ %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %idxprom
+ %gep34 = getelementptr inbounds i32, i32 addrspace(2)* %gep1, i64 4
+ %load2 = load i32, i32 addrspace(2)* %gep34, align 4
+ %gep4 = getelementptr inbounds i32, i32 addrspace(1)* %gep2, i64 4
+ store i32 %load1, i32 addrspace(1)* %gep2, align 4
+ store i32 %load2, i32 addrspace(1)* %gep4, align 4
+ ret void
+ }
+
+ declare i32 @llvm.amdgcn.workitem.id.x()
+
+...
+---
+name: flat_load_clustering
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vgpr_32 }
+ - { id: 1, class: sgpr_64 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: sreg_64_xexec }
+ - { id: 4, class: sreg_64_xexec }
+ - { id: 5, class: vgpr_32 }
+ - { id: 6, class: vgpr_32 }
+ - { id: 7, class: vgpr_32 }
+ - { id: 8, class: vgpr_32 }
+ - { id: 9, class: vreg_64 }
+ - { id: 10, class: vreg_64 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vreg_64 }
+ - { id: 13, class: vreg_64 }
+liveins:
+ - { reg: '%vgpr0', virtual-reg: '%0' }
+ - { reg: '%sgpr4_sgpr5', virtual-reg: '%1' }
+body: |
+ bb.0.bb:
+ liveins: %vgpr0, %sgpr4_sgpr5
+
+ %1 = COPY %sgpr4_sgpr5
+ %0 = COPY %vgpr0
+ %3 = S_LOAD_DWORDX2_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %4 = S_LOAD_DWORDX2_IMM %1, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %7 = V_LSHLREV_B32_e32 2, %0, implicit %exec
+ %2 = V_MOV_B32_e32 0, implicit %exec
+ undef %12.sub0 = V_ADD_I32_e32 %4.sub0, %7, implicit-def %vcc, implicit %exec
+ %11 = COPY %4.sub1
+ %12.sub1 = V_ADDC_U32_e32 %11, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+ %5 = FLAT_LOAD_DWORD %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep1)
+ undef %9.sub0 = V_ADD_I32_e32 %3.sub0, %7, implicit-def %vcc, implicit %exec
+ %8 = COPY %3.sub1
+ %9.sub1 = V_ADDC_U32_e32 %8, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+ undef %13.sub0 = V_ADD_I32_e32 16, %12.sub0, implicit-def %vcc, implicit %exec
+ %13.sub1 = V_ADDC_U32_e32 %12.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+ %6 = FLAT_LOAD_DWORD %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep34)
+ undef %10.sub0 = V_ADD_I32_e32 16, %9.sub0, implicit-def %vcc, implicit %exec
+ %10.sub1 = V_ADDC_U32_e32 %9.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
+ FLAT_STORE_DWORD %9, %5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep2)
+ FLAT_STORE_DWORD %10, %6, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep4)
+ S_ENDPGM
+
+...
; CI: buffer_store_dword
; CI: s_endpgm
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
-; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
-; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
+; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
+; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
; GFX9: global_store_dword
; GFX9: s_endpgm
define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
-; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}