Instruction *UseInst,
int OpIdx0, int OpIdx1) const;
+ /// Check whether we have enough local memory for promotion.
+ bool hasSufficientLocalMem(const Function &F);
+
public:
static char ID;
StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
- void handleAlloca(AllocaInst &I);
+ bool handleAlloca(AllocaInst &I, bool SufficientLDS);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
if (!ST.isPromoteAllocaEnabled())
return false;
- AS = AMDGPU::getAMDGPUAS(*F.getParent());
-
- FunctionType *FTy = F.getFunctionType();
-
- // If the function has any arguments in the local address space, then it's
- // possible these arguments require the entire local memory space, so
- // we cannot use local memory in the pass.
- for (Type *ParamTy : FTy->params()) {
- PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
- if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
- LocalMemLimit = 0;
- DEBUG(dbgs() << "Function has local memory argument. Promoting to "
- "local memory disabled.\n");
- return false;
- }
- }
-
- LocalMemLimit = ST.getLocalMemorySize();
- if (LocalMemLimit == 0)
- return false;
-
- const DataLayout &DL = Mod->getDataLayout();
-
- // Check how much local memory is being used by global objects
- CurrentLocalMemUsage = 0;
- for (GlobalVariable &GV : Mod->globals()) {
- if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
- continue;
-
- for (const User *U : GV.users()) {
- const Instruction *Use = dyn_cast<Instruction>(U);
- if (!Use)
- continue;
-
- if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
- // FIXME: Try to account for padding here. The padding is currently
- // determined from the inverse order of uses in the function. I'm not
- // sure if the use list order is in any way connected to this, so the
- // total reported size is likely incorrect.
- uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
- CurrentLocalMemUsage += AllocSize;
- break;
- }
- }
- }
-
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
- F);
-
- // Restrict local memory usage so that we don't drastically reduce occupancy,
- // unless it is already significantly reduced.
-
- // TODO: Have some sort of hint or other heuristics to guess occupancy based
- // on other factors..
- unsigned OccupancyHint = ST.getWavesPerEU(F).second;
- if (OccupancyHint == 0)
- OccupancyHint = 7;
-
- // Clamp to max value.
- OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
-
- // Check the hint but ignore it if it's obviously wrong from the existing LDS
- // usage.
- MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
-
- // Round up to the next tier of usage.
- unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
-
- // Program is possibly broken by using more local mem than available.
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
- return false;
-
- LocalMemLimit = MaxSizeWithWaveCount;
-
- DEBUG(
- dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
- << " Rounding size to " << MaxSizeWithWaveCount
- << " with a maximum occupancy of " << MaxOccupancy << '\n'
- << " and " << (LocalMemLimit - CurrentLocalMemUsage)
- << " available for promotion\n"
- );
+ AS = AMDGPU::getAMDGPUAS(*F.getParent());
+ bool SufficientLDS = hasSufficientLocalMem(F);
+ bool Changed = false;
BasicBlock &EntryBB = *F.begin();
for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
AllocaInst *AI = dyn_cast<AllocaInst>(I);
++I;
if (AI)
- handleAlloca(*AI);
+ Changed |= handleAlloca(*AI, SufficientLDS);
}
- return true;
+ return Changed;
}
std::pair<Value *, Value *>
return true;
}
+bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
+
+ FunctionType *FTy = F.getFunctionType();
+ const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+ // If the function has any arguments in the local address space, then it's
+ // possible these arguments require the entire local memory space, so
+ // we cannot use local memory in the pass.
+ for (Type *ParamTy : FTy->params()) {
+ PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
+ if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
+ LocalMemLimit = 0;
+ DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+ "local memory disabled.\n");
+ return false;
+ }
+ }
+
+ LocalMemLimit = ST.getLocalMemorySize();
+ if (LocalMemLimit == 0)
+ return false;
+
+ const DataLayout &DL = Mod->getDataLayout();
+
+ // Check how much local memory is being used by global objects
+ CurrentLocalMemUsage = 0;
+ for (GlobalVariable &GV : Mod->globals()) {
+ if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
+ continue;
+
+ for (const User *U : GV.users()) {
+ const Instruction *Use = dyn_cast<Instruction>(U);
+ if (!Use)
+ continue;
+
+ if (Use->getParent()->getParent() == &F) {
+ unsigned Align = GV.getAlignment();
+ if (Align == 0)
+ Align = DL.getABITypeAlignment(GV.getValueType());
+
+ // FIXME: Try to account for padding here. The padding is currently
+ // determined from the inverse order of uses in the function. I'm not
+ // sure if the use list order is in any way connected to this, so the
+ // total reported size is likely incorrect.
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage += AllocSize;
+ break;
+ }
+ }
+ }
+
+ unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+ F);
+
+ // Restrict local memory usage so that we don't drastically reduce occupancy,
+ // unless it is already significantly reduced.
+
+ // TODO: Have some sort of hint or other heuristics to guess occupancy based
+ // on other factors..
+ unsigned OccupancyHint = ST.getWavesPerEU(F).second;
+ if (OccupancyHint == 0)
+ OccupancyHint = 7;
+
+ // Clamp to max value.
+ OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
+
+ // Check the hint but ignore it if it's obviously wrong from the existing LDS
+ // usage.
+ MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+
+ // Round up to the next tier of usage.
+ unsigned MaxSizeWithWaveCount
+ = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+
+ // Program is possibly broken by using more local mem than available.
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+ return false;
+
+ LocalMemLimit = MaxSizeWithWaveCount;
+
+ DEBUG(
+ dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+ << " Rounding size to " << MaxSizeWithWaveCount
+ << " with a maximum occupancy of " << MaxOccupancy << '\n'
+ << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+ << " available for promotion\n"
+ );
+
+ return true;
+}
+
// FIXME: Should try to pick the most likely to be profitable allocas first.
-void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
+bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// Array allocations are probably not worth handling, since an allocation of
// the array type is the canonical form.
if (!I.isStaticAlloca() || I.isArrayAllocation())
- return;
+ return false;
IRBuilder<> Builder(&I);
DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I, AS)) {
- DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
- return;
- }
+ if (tryPromoteAllocaToVector(&I, AS))
+ return true; // Promoted to vector.
const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();
break;
default:
DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n");
- return;
+ return false;
}
+ // Not likely to have sufficient local memory for promotion.
+ if (!SufficientLDS)
+ return false;
+
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
if (NewSize > LocalMemLimit) {
DEBUG(dbgs() << " " << AllocSize
<< " bytes of local memory not available to promote\n");
- return;
+ return false;
}
CurrentLocalMemUsage = NewSize;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
DEBUG(dbgs() << " Do not know how to convert all uses\n");
- return;
+ return false;
}
DEBUG(dbgs() << "Promoting alloca to local memory\n");
llvm_unreachable("Don't know how to promote alloca intrinsic use.");
}
}
+ return true;
}
FunctionPass *llvm::createAMDGPUPromoteAlloca() {