static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
-                                     const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
- return !(A->mayStore() || B->mayStore()) ||
-        TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
+ return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
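
// A minimal standalone sketch (hypothetical names, not part of the file above)
// of the reordering rule that memAccessesCanBeReordered now gets from
// MachineInstr::mayAlias: a pair of accesses may always be reordered when
// neither side is a store; otherwise it may be reordered only when the two
// accesses are known not to alias.

#include <cassert>

static bool canReorderSketch(bool AStores, bool BStores, bool MayAlias) {
  return !(AStores || BStores) || !MayAlias;
}

int main() {
  assert(canReorderSketch(false, false, true));  // RAR - safe to reorder
  assert(!canReorderSketch(false, true, true));  // RAW or WAR - cannot reorder
  assert(!canReorderSketch(true, true, true));   // WAW - cannot reorder
  assert(canReorderSketch(true, true, false));   // provably disjoint stores are fine
  return 0;
}
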
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
-                                   const SIInstrInfo *TII, AliasAnalysis *AA) {
+                                   AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());
  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
-   if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+   if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}
    if (MBBI->mayLoadOrStore() &&
-       (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
+       (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
      // We fail condition #1, but we may still be able to satisfy condition
      // #2. Add this instruction to the move list and then we will check
      // if condition #2 holds once we have selected the matching instruction.

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
-       if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+       if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }
    // Check if we can move I across MBBI and if we can move all I's users,
    // i.e. whether it is safe to move I and also all the instructions in
    // InstsToMove down past this instruction.
-   if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
-       !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
+   if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
+       !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }

  return false;
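
// A simplified, self-contained model (assumed names and a toy alias check, not
// the pass's real data structures) of how the loop above uses the two helpers:
// an intervening access that cannot be reordered with I is deferred into
// InstsToMove, and a candidate match is accepted only if every deferred access
// can also be moved down past it (the condition #1 / condition #2 reasoning in
// the comments above).

#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>

struct ToyAccess {
  bool IsStore;
  int Addr;      // stand-in for whatever AliasAnalysis would compare
  bool Pairable; // true for the DS ops the pass could try to merge with I
};

// RAR is always fine; any pair involving a store needs proof of no aliasing.
static bool toyCanReorder(const ToyAccess &A, const ToyAccess &B) {
  return !(A.IsStore || B.IsStore) || A.Addr != B.Addr;
}

// Mirror of canMoveInstsAcrossMemOp: every deferred access must be
// reorderable with the access it would be moved across.
static bool toyCanMoveAcross(const ToyAccess &MemOp,
                             const std::vector<const ToyAccess *> &InstsToMove) {
  for (const ToyAccess *M : InstsToMove)
    if (!toyCanReorder(MemOp, *M))
      return false;
  return true;
}

// Walk forward from Ops[I] looking for a pairable access.
static std::optional<std::size_t>
toyFindMatch(const std::vector<ToyAccess> &Ops, std::size_t I) {
  std::vector<const ToyAccess *> InstsToMove;
  for (std::size_t J = I + 1; J < Ops.size(); ++J) {
    const ToyAccess &MBBI = Ops[J];
    if (!MBBI.Pairable) {
      // Condition #1 failed: remember MBBI and recheck condition #2 against
      // the eventual match instead of giving up.
      if (!toyCanReorder(Ops[I], MBBI) || !toyCanMoveAcross(MBBI, InstsToMove))
        InstsToMove.push_back(&MBBI);
      continue;
    }
    // Candidate match: everything deferred so far must be movable past it
    // (the real pass additionally checks that offsets and widths combine).
    if (toyCanMoveAcross(MBBI, InstsToMove))
      return J;
    break;
  }
  return std::nullopt;
}

int main() {
  // Roughly mirrors ds_combine_WAR below: load @27, a store that aliases the
  // first load, then load @7 - the store is deferred and the loads still pair.
  std::vector<ToyAccess> Ops = {
      {false, 27, true},  // first load (I)
      {true, 27, false},  // store that aliases the first load
      {false, 7, true},   // second load - the match
  };
  assert(toyFindMatch(Ops, 0) == 2u);
  return 0;
}
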
--- /dev/null
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+
+; There is no dependence between the store and the two loads, so we can combine
+; the loads; the combined load is placed at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_nodep
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
+define amdgpu_kernel void @ds_combine_nodep(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The store depends on the first load, so we cannot move the first load down to
+; combine it with the second load directly. However, we can move the store down
+; past the combined load.
+
+; GCN-LABEL: {{^}}ds_combine_WAR
+
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 7
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The second load depends on the store, but we can still combine the two loads
+; by placing the combined load at the original position of the second load.
+
+; GCN-LABEL: {{^}}ds_combine_RAW
+
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
+define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 24
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}
+
+
+; The store depends on the first load, and the second load depends on the store,
+; so we cannot combine the two loads.
+
+; GCN-LABEL: {{^}}ds_combine_WAR_RAW
+
+; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
+; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
+define amdgpu_kernel void @ds_combine_WAR_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
+
+ %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
+ %addr0 = getelementptr i8, i8 addrspace(3)* %base, i32 100
+ %tmp0 = bitcast i8 addrspace(3)* %addr0 to float addrspace(3)*
+ %vaddr0 = bitcast float addrspace(3)* %tmp0 to <3 x float> addrspace(3)*
+ %load0 = load <3 x float>, <3 x float> addrspace(3)* %vaddr0, align 4
+ %v0 = extractelement <3 x float> %load0, i32 2
+
+ %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
+ %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
+
+ %tmp2 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %vaddrs = bitcast float addrspace(3)* %tmp2 to <2 x float> addrspace(3)*
+ store <2 x float> %data, <2 x float> addrspace(3)* %vaddrs, align 4
+
+ %vaddr1 = getelementptr float, float addrspace(3)* %inptr, i32 26
+ %v1 = load float, float addrspace(3)* %vaddr1, align 4
+
+ %sum = fadd float %v0, %v1
+ store float %sum, float addrspace(1)* %out, align 4
+ ret void
+}