static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
-                                     const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) ||
-        TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
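
// A minimal standalone sketch (hypothetical names, not part of the file above)
// of the reordering rule that memAccessesCanBeReordered now gets from
// MachineInstr::mayAlias: a pair of accesses may always be reordered when
// neither side is a store; otherwise it may be reordered only when the two
// accesses are known not to alias.

#include <cassert>

static bool canReorderSketch(bool AStores, bool BStores, bool MayAlias) {
  return !(AStores || BStores) || !MayAlias;
}

int main() {
  assert(canReorderSketch(false, false, true));  // RAR - safe to reorder
  assert(!canReorderSketch(false, true, true));  // RAW or WAR - cannot reorder
  assert(!canReorderSketch(true, true, true));   // WAW - cannot reorder
  assert(canReorderSketch(true, true, false));   // provably disjoint stores are fine
  return 0;
}
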
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
-                                   const SIInstrInfo *TII, AliasAnalysis *AA) {
+                                   AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());
  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
-   if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+   if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}
    if (MBBI->mayLoadOrStore() &&
-       (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+       (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
      // We fail condition #1, but we may still be able to satisfy condition
      // #2. Add this instruction to the move list and then we will check
      // if condition #2 holds once we have selected the matching instruction.

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
-       if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+       if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }
    // Check if we can move I across MBBI and if we can move all I's users,
    // i.e. whether it is safe to move I and also all the instructions in
    // InstsToMove down past this instruction.
-   if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-       !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+   if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+       !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }

  return false;
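
// A simplified, self-contained model (assumed names and a toy alias check, not
// the pass's real data structures) of how the loop above uses the two helpers:
// an intervening access that cannot be reordered with I is deferred into
// InstsToMove, and a candidate match is accepted only if every deferred access
// can also be moved down past it (the condition #1 / condition #2 reasoning in
// the comments above).

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

struct ToyAccess {
  bool IsStore;
  int Addr;      // stand-in for whatever AliasAnalysis would compare
  bool Pairable; // true for the DS ops the pass could try to merge with I
};

// RAR is always fine; any pair involving a store needs proof of no aliasing.
static bool toyCanReorder(const ToyAccess &A, const ToyAccess &B) {
  return !(A.IsStore || B.IsStore) || A.Addr != B.Addr;
}

// Mirror of canMoveInstsAcrossMemOp: every deferred access must be
// reorderable with the access it would be moved across.
static bool toyCanMoveAcross(const ToyAccess &MemOp,
                             const std::vector<const ToyAccess *> &InstsToMove) {
  for (const ToyAccess *M : InstsToMove)
    if (!toyCanReorder(MemOp, *M))
      return false;
  return true;
}

// Walk forward from Ops[I] looking for a pairable access.
static std::optional<std::size_t>
toyFindMatch(const std::vector<ToyAccess> &Ops, std::size_t I) {
  std::vector<const ToyAccess *> InstsToMove;
  for (std::size_t J = I + 1; J < Ops.size(); ++J) {
    const ToyAccess &MBBI = Ops[J];
    if (!MBBI.Pairable) {
      // Condition #1 failed: remember MBBI and recheck condition #2 against
      // the eventual match instead of giving up.
      if (!toyCanReorder(Ops[I], MBBI) || !toyCanMoveAcross(MBBI, InstsToMove))
        InstsToMove.push_back(&MBBI);
      continue;
    }
    // Candidate match: everything deferred so far must be movable past it
    // (the real pass additionally checks that offsets and widths combine).
    if (toyCanMoveAcross(MBBI, InstsToMove))
      return J;
    break;
  }
  return std::nullopt;
}

int main() {
  // Roughly mirrors ds_combine_WAR below: load @27, a store that aliases the
  // first load, then load @7 - the store is deferred and the loads still pair.
  std::vector<ToyAccess> Ops = {
      {false, 27, true},  // first load (I)
      {true, 27, false},  // store that aliases the first load
      {false, 7, true},   // second load - the match
  };
  assert(toyFindMatch(Ops, 0) == 2u);
  return 0;
}
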
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+
+; There is no dependence between the store and the two loads, so we can combine
+; the loads; the combined load is placed at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_nodep
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
+define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The store depends on the first load, so we cannot move the first load down to
+; combine it with the second load directly. However, we can move the store down
+; past the combined load.
+
+; GCN-LABEL: {{^}}ds_combine_WAR
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The second load depends on the store, but we can still combine the two loads
+; by placing the combined load at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_RAW
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
+define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The store depends on the first load, and the second load depends on the store,
+; so we cannot combine the two loads.
+
+; GCN-LABEL: {{^}}ds_combine_WAR_RAW
+
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
+define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}