[AMDGPU][CodeGen] To improve CGEMM performance: combine LDS reads.

author Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)

committer Alexander Timofeev <Alexander.Timofeev@amd.com>

Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)
author Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)
committer Alexander Timofeev <Alexander.Timofeev@amd.com>
Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

index 6915191665f68b8addf7f3d6706d5e608f2e5aff..99fe96c0be228e42b307be3d0ff41ce24e2fe007 100644 (file)
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -141,6 +141,18 @@ static void addDefsToList(const MachineInstr &MI,
    }
  }
  
+// Returns true if memory accesses A and B may swap places: either TII
+// reports them trivially disjoint, or neither one may store (RAR).
+// Any pair with a store (RAW, WAR or WAW) has to keep its order.
+static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
+                                      MachineBasicBlock::iterator B,
+                                      const SIInstrInfo *TII,
+                                      llvm::AliasAnalysis *AA) {
+  if (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA))
+    return true;
+  return !A->mayStore() && !B->mayStore();
+}
+
  // Add MI and its defs to the lists if MI reads one of the defs that are
  // already in the list. Returns true in that case.
  static bool
@@ -173,8 +185,8 @@ canMoveInstsAcrossMemOp(MachineInstr &MemOp,
    for (MachineInstr *InstToMove : InstsToMove) {
      if (!InstToMove->mayLoadOrStore())
        continue;
-    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
-      return false;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+        return false;
    }
    return true;
  }
@@ -233,7 +245,7 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
          return E;
  
        if (MBBI->mayLoadOrStore() &&
-          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
          // We fail condition #1, but we may still be able to satisfy condition
          // #2.  Add this instruction to the move list and then we will check
          // if condition #2 holds once we have selected the matching instruction.
@@ -288,8 +300,10 @@ SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
      // We could potentially keep looking, but we'd need to make sure that
      // it was safe to move I and also all the instruction in InstsToMove
      // down past this instruction.
-    // FIXME: This is too conservative.
-    break;
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+      !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+     )
+      break;
    }
    return E;
  }
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll

index 6e30cff9609d30d5e08232ba0e815606c8522827..9a313230e3035bda4c1717ccefd6775b8d06b988 100644 (file)
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -493,6 +493,46 @@ define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in)
    ret void
  }
  
+; SI-LABEL: ds_read_diff_base_interleaving
+; SI-NOT: ds_read_b32
+define amdgpu_kernel void @ds_read_diff_base_interleaving( ; loads two floats from each of four addrspace(3) bases
+  float addrspace(1)* nocapture %arg,
+  [4 x [4 x float]] addrspace(3)* %arg1,
+  [4 x [4 x float]] addrspace(3)* %arg2,
+  [4 x [4 x float]] addrspace(3)* %arg3,
+  [4 x [4 x float]] addrspace(3)* %arg4) #1 {
+bb:
+  %tmp = getelementptr float, float addrspace(1)* %arg, i64 10 ; result address, %arg plus 10 floats
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0 ; &arg1[id.y][0]
+  %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5 ; &arg2[0][id.x]
+  %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0 ; &arg3[id.y][0]
+  %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5 ; &arg4[0][id.x]
+  %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1 ; &arg1[id.y][1]
+  %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5 ; &arg2[1][id.x]
+  %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1 ; &arg3[id.y][1]
+  %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5 ; &arg4[1][id.x]
+  %tmp15 = load float, float addrspace(3)* %tmp7 ; first load from the arg1 base
+  %tmp16 = load float, float addrspace(3)* %tmp8 ; first load from the arg2 base
+  %tmp17 = fmul float %tmp15, %tmp16
+  %tmp18 = fadd float 2.000000e+00, %tmp17
+  %tmp19 = load float, float addrspace(3)* %tmp9 ; arg3 and arg4 loads sit between the two arg1/arg2 load pairs
+  %tmp20 = load float, float addrspace(3)* %tmp10
+  %tmp21 = fmul float %tmp19, %tmp20
+  %tmp22 = fsub float %tmp18, %tmp21
+  %tmp23 = load float, float addrspace(3)* %tmp11 ; second load from the arg1 base
+  %tmp24 = load float, float addrspace(3)* %tmp12 ; second load from the arg2 base
+  %tmp25 = fmul float %tmp23, %tmp24
+  %tmp26 = fsub float %tmp22, %tmp25
+  %tmp27 = load float, float addrspace(3)* %tmp13 ; second load from the arg3 base
+  %tmp28 = load float, float addrspace(3)* %tmp14 ; second load from the arg4 base
+  %tmp29 = fmul float %tmp27, %tmp28
+  %tmp30 = fsub float %tmp26, %tmp29
+  store float %tmp30, float addrspace(1)* %tmp ; single addrspace(1) store of the accumulated value
+  ret void
+}
+
  ; Function Attrs: nounwind readnone
  declare i32 @llvm.amdgcn.workgroup.id.x() #1
author	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)
committer	Alexander Timofeev <Alexander.Timofeev@amd.com>
	Thu, 3 Nov 2016 14:37:13 +0000 (14:37 +0000)
lib/Target/AMDGPU/SILoadStoreOptimizer.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/ds_read2.ll		patch \| blob \| history