AMDGPUSubtarget(TT, GPU, FS, TM),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- AMDGPUGISelActualAccessor *GISel =
- new AMDGPUGISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AMDGPUCallLowering(*getTargetLowering()));
-#endif
- setGISelAccessor(*GISel);
-}
+ TLInfo(TM, *this) {}
unsigned R600Subtarget::getStackEntrySize() const {
switch (getWavefrontSize()) {
AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
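+// Select the CPU for a function: an explicit "target-cpu" attribute wins,
+// otherwise fall back to the CPU this target machine was created with.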
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
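+// Likewise select the feature string, preferring a per-function
+// "target-features" attribute over the target machine's default.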
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, getTargetCPU(), FS, *this) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
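+// Return the subtarget for a function, keyed on its GPU name plus feature
+// string so that functions with the same effective target share a single
+// cached R600Subtarget.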
+const R600Subtarget *
+R600TargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, getTargetCPU(), FS, *this) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
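+// Same caching scheme as the R600 version above; new SISubtargets also get
+// a GlobalISel accessor (with call lowering when GlobalISel is built in).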
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(
- AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
});
}
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ StringRef getGPUName(const Function &F) const;
+ StringRef getFeatureString(const Function &F) const;
+
public:
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
class R600TargetMachine final : public AMDGPUTargetMachine {
private:
- R600Subtarget Subtarget;
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
public:
R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const R600Subtarget *getSubtargetImpl() const {
- return &Subtarget;
- }
-
- const R600Subtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const R600Subtarget *getSubtargetImpl(const Function &) const override;
};
//===----------------------------------------------------------------------===//
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- SISubtarget Subtarget;
+ mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const SISubtarget *getSubtargetImpl() const {
- return &Subtarget;
- }
-
- const SISubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const SISubtarget *getSubtargetImpl(const Function &) const override;
};
-inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl() const {
- if (getTargetTriple().getArch() == Triple::amdgcn)
- return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl();
- return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl();
-}
-
inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
const Function &F) const {
if (getTargetTriple().getArch() == Triple::amdgcn)
}
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
- : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
- TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
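+; Check that subtargets are constructed per function: "target-cpu" and
+; "target-features" attributes on each function should override the
+; -march defaults independently of the rest of the module.
+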
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+
+; VI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.wb() #0
+
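+; No "target-cpu" attribute, so the target machine's default CPU is used.
+; SI cannot encode the 0x400 kernarg offset as an SMRD immediate, so it is
+; materialized in a register first.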
+; CHECK-LABEL: {{^}}target_none:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_none() #0 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
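+; Explicitly requesting tahiti should produce the same code as the default
+; run line above.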
+; CHECK-LABEL: {{^}}target_tahiti:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_tahiti() #2 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
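+; bonaire (CI) can encode this kernarg offset directly as an SMRD immediate
+; and supports s_dcache_inv_vol.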
+; CHECK-LABEL: {{^}}target_bonaire:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_dcache_inv_vol
+define void @target_bonaire() #3 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.inv.vol()
+ ret void
+}
+
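+; fiji (VI) encodes the SMRD offset in bytes (0x400), uses flat memory
+; instructions, and supports s_dcache_wb.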
+; CHECK-LABEL: {{^}}target_fiji:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
+; CHECK: flat_store_dword
+; CHECK: s_dcache_wb{{$}}
+define void @target_fiji() #4 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.wb()
+ ret void
+}
+
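+; With +promote-alloca the private array is promoted to LDS, so the load
+; becomes a ds_read and the kernel reports LDS usage.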
+; CHECK-LABEL: {{^}}promote_alloca_enabled:
+; CHECK: ds_read_b32
+; CHECK: ; LDSByteSize: 5120
+define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
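+; With -promote-alloca the array stays on the stack, so a scratch resource
+; descriptor and a nonzero ScratchSize are emitted.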
+; CHECK-LABEL: {{^}}promote_alloca_disabled:
+; CHECK: SCRATCH_RSRC_DWORD0
+; CHECK: SCRATCH_RSRC_DWORD1
+; CHECK: ScratchSize: 24
+define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+attributes #3 = { nounwind "target-cpu"="bonaire" }
+attributes #4 = { nounwind "target-cpu"="fiji" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }