AMDGPU/SI: Move the local memory usage related checking after calling convention...

author Changpeng Fang <changpeng.fang@gmail.com>

Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)

committer Changpeng Fang <changpeng.fang@gmail.com>

Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)
author Changpeng Fang <changpeng.fang@gmail.com>
Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)
committer Changpeng Fang <changpeng.fang@gmail.com>
Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

index 85184b363905e0381587ba692b590a944f8e74d1..07f92918a43fe018ce772ed053aa0629900879fb 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -97,6 +97,9 @@ private:
                                         Instruction *UseInst,
                                         int OpIdx0, int OpIdx1) const;
  
+  /// Check whether we have enough local memory for promotion.
+  bool hasSufficientLocalMem(const Function &F);
+
  public:
    static char ID;
  
@@ -107,7 +110,7 @@ public:
  
    StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
  
-  void handleAlloca(AllocaInst &I);
+  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
  
    void getAnalysisUsage(AnalysisUsage &AU) const override {
      AU.setPreservesCFG();
@@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
    if (!ST.isPromoteAllocaEnabled())
      return false;
-  AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
-  FunctionType *FTy = F.getFunctionType();
-
-  // If the function has any arguments in the local address space, then it's
-  // possible these arguments require the entire local memory space, so
-  // we cannot use local memory in the pass.
-  for (Type *ParamTy : FTy->params()) {
-    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
-      LocalMemLimit = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
-                      "local memory disabled.\n");
-      return false;
-    }
-  }
-
-  LocalMemLimit = ST.getLocalMemorySize();
-  if (LocalMemLimit == 0)
-    return false;
-
-  const DataLayout &DL = Mod->getDataLayout();
-
-  // Check how much local memory is being used by global objects
-  CurrentLocalMemUsage = 0;
-  for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
-      continue;
-
-    for (const User *U : GV.users()) {
-      const Instruction *Use = dyn_cast<Instruction>(U);
-      if (!Use)
-        continue;
-
-      if (Use->getParent()->getParent() == &F) {
-        unsigned Align = GV.getAlignment();
-        if (Align == 0)
-          Align = DL.getABITypeAlignment(GV.getValueType());
  
-        // FIXME: Try to account for padding here. The padding is currently
-        // determined from the inverse order of uses in the function. I'm not
-        // sure if the use list order is in any way connected to this, so the
-        // total reported size is likely incorrect.
-        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
-        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
-        CurrentLocalMemUsage += AllocSize;
-        break;
-      }
-    }
-  }
-
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
-                                                          F);
-
-  // Restrict local memory usage so that we don't drastically reduce occupancy,
-  // unless it is already significantly reduced.
-
-  // TODO: Have some sort of hint or other heuristics to guess occupancy based
-  // on other factors..
-  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
-  if (OccupancyHint == 0)
-    OccupancyHint = 7;
-
-  // Clamp to max value.
-  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
-  // Check the hint but ignore it if it's obviously wrong from the existing LDS
-  // usage.
-  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
-  // Round up to the next tier of usage.
-  unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
-  // Program is possibly broken by using more local mem than available.
-  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
-    return false;
-
-  LocalMemLimit = MaxSizeWithWaveCount;
-
-  DEBUG(
-    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
-    << "  Rounding size to " << MaxSizeWithWaveCount
-    << " with a maximum occupancy of " << MaxOccupancy << '\n'
-    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
-    << " available for promotion\n"
-  );
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
  
+  bool SufficientLDS = hasSufficientLocalMem(F);
+  bool Changed = false;
    BasicBlock &EntryBB = *F.begin();
    for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
      AllocaInst *AI = dyn_cast<AllocaInst>(I);
  
      ++I;
      if (AI)
-      handleAlloca(*AI);
+      Changed |= handleAlloca(*AI, SufficientLDS);
    }
  
-  return true;
+  return Changed;
  }
  
  std::pair<Value *, Value *>
@@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
    return true;
  }
  
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+  FunctionType *FTy = F.getFunctionType();
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (Type *ParamTy : FTy->params()) {
+    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+                      "local memory disabled.\n");
+      return false;
+    }
+  }
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
+    return false;
+
+  const DataLayout &DL = Mod->getDataLayout();
+
+  // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
+  for (GlobalVariable &GV : Mod->globals()) {
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+      continue;
+
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
+      if (!Use)
+        continue;
+
+      if (Use->getParent()->getParent() == &F) {
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
+        break;
+      }
+    }
+  }
+
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors..
+  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+    << "  Rounding size to " << MaxSizeWithWaveCount
+    << " with a maximum occupancy of " << MaxOccupancy << '\n'
+    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+    << " available for promotion\n"
+  );
+
+  return true;
+}
+
  // FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
    // Array allocations are probably not worth handling, since an allocation of
    // the array type is the canonical form.
    if (!I.isStaticAlloca() || I.isArrayAllocation())
-    return;
+    return false;
  
    IRBuilder<> Builder(&I);
  
@@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
  
    DEBUG(dbgs() << "Trying to promote " << I << '\n');
  
-  if (tryPromoteAllocaToVector(&I, AS)) {
-    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
-    return;
-  }
+  if (tryPromoteAllocaToVector(&I, AS))
+    return true; // Promoted to vector.
  
    const Function &ContainingFunction = *I.getParent()->getParent();
    CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
      break;
    default:
      DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
-    return;
+    return false;
    }
  
+  // Not likely to have sufficient local memory for promotion.
+  if (!SufficientLDS)
+    return false;
+
    const AMDGPUSubtarget &ST =
      TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
@@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
    if (NewSize > LocalMemLimit) {
      DEBUG(dbgs() << "  " << AllocSize
            << " bytes of local memory not available to promote\n");
-    return;
+    return false;
    }
  
    CurrentLocalMemUsage = NewSize;
@@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
  
    if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
      DEBUG(dbgs() << " Do not know how to convert all uses\n");
-    return;
+    return false;
    }
  
    DEBUG(dbgs() << "Promoting alloca to local memory\n");
@@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
        llvm_unreachable("Don't know how to promote alloca intrinsic use.");
      }
    }
+  return true;
  }
  
  FunctionPass *llvm::createAMDGPUPromoteAlloca() {
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll

index 03cf725601b79dd2801439ef6364afae6a5742e2..a0aac8c1d9ba527bacce4f5c543f1aa04526546e 100644 (file)
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -138,3 +138,25 @@ entry:
    store float %tmp2, float addrspace(1)* %out
    ret void
  }
+
+; The pointer arguments in local address space should not affect promotion to vector.
+
+; OPT-LABEL: @vector_read_with_local_arg(
+; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
+; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %tmp = alloca [4 x i32]
+  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
+  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
+  store i32 0, i32* %x
+  store i32 1, i32* %y
+  store i32 2, i32* %z
+  store i32 3, i32* %w
+  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index
+  %tmp2 = load i32, i32* %tmp1
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
author	Changpeng Fang <changpeng.fang@gmail.com>
	Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)
committer	Changpeng Fang <changpeng.fang@gmail.com>
	Tue, 23 May 2017 20:25:41 +0000 (20:25 +0000)
lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/vector-alloca.ll		patch \| blob \| history