AMDGPUSubtarget(TT, GPU, FS, TM),
InstrInfo(*this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
- TLInfo(TM, *this) {
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- AMDGPUGISelActualAccessor *GISel =
- new AMDGPUGISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AMDGPUCallLowering(*getTargetLowering()));
-#endif
- setGISelAccessor(*GISel);
-}
+ TLInfo(TM, *this) {}
unsigned R600Subtarget::getStackEntrySize() const {
switch (getWavefrontSize()) {
AMDGPUTargetMachine::~AMDGPUTargetMachine() { }
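+// Select the CPU for a function: an explicit "target-cpu" attribute wins,
+// otherwise fall back to the CPU this target machine was created with.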
+StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
+ Attribute GPUAttr = F.getFnAttribute("target-cpu");
+ return GPUAttr.hasAttribute(Attribute::None) ?
+ getTargetCPU() : GPUAttr.getValueAsString();
+}
+
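+// Likewise select the feature string, preferring a per-function
+// "target-features" attribute over the target machine's default.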
+StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ return FSAttr.hasAttribute(Attribute::None) ?
+ getTargetFeatureString() :
+ FSAttr.getValueAsString();
+}
+
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, getTargetCPU(), FS, *this) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
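+// Return the subtarget for a function, keyed on its GPU name plus feature
+// string so that functions with the same effective target share a single
+// cached R600Subtarget.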
+const R600Subtarget *
+R600TargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
TargetOptions Options,
Optional<Reloc::Model> RM,
CodeModel::Model CM, CodeGenOpt::Level OL)
- : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, getTargetCPU(), FS, *this) {}
+ : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
+
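+// Same caching scheme as the R600 version above; new SISubtargets also get
+// a GlobalISel accessor (with call lowering when GlobalISel is built in).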
+const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
+ StringRef GPU = getGPUName(F);
+ StringRef FS = getFeatureString(F);
+
+ SmallString<128> SubtargetKey(GPU);
+ SubtargetKey.append(FS);
+
+ auto &I = SubtargetMap[SubtargetKey];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *GISel = new GISelAccessor();
+#else
+ SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
+ GISel->CallLoweringInfo.reset(
+ new AMDGPUCallLowering(*I->getTargetLowering()));
+#endif
+
+ I->setGISelAccessor(*GISel);
+ }
+
+ return I.get();
+}
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
return TargetIRAnalysis([this](const Function &F) {
- return TargetTransformInfo(
- AMDGPUTTIImpl(this, F.getParent()->getDataLayout()));
+ return TargetTransformInfo(AMDGPUTTIImpl(this, F));
});
}
std::unique_ptr<TargetLoweringObjectFile> TLOF;
AMDGPUIntrinsicInfo IntrinsicInfo;
+ StringRef getGPUName(const Function &F) const;
+ StringRef getFeatureString(const Function &F) const;
+
public:
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
class R600TargetMachine final : public AMDGPUTargetMachine {
private:
- R600Subtarget Subtarget;
+ mutable StringMap<std::unique_ptr<R600Subtarget>> SubtargetMap;
public:
R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const R600Subtarget *getSubtargetImpl() const {
- return &Subtarget;
- }
-
- const R600Subtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const R600Subtarget *getSubtargetImpl(const Function &) const override;
};
//===----------------------------------------------------------------------===//
class GCNTargetMachine final : public AMDGPUTargetMachine {
private:
- SISubtarget Subtarget;
+ mutable StringMap<std::unique_ptr<SISubtarget>> SubtargetMap;
public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
- const SISubtarget *getSubtargetImpl() const {
- return &Subtarget;
- }
-
- const SISubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const SISubtarget *getSubtargetImpl(const Function &) const override;
};
-inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl() const {
- if (getTargetTriple().getArch() == Triple::amdgcn)
- return static_cast<const GCNTargetMachine *>(this)->getSubtargetImpl();
- return static_cast<const R600TargetMachine *>(this)->getSubtargetImpl();
-}
-
inline const AMDGPUSubtarget *AMDGPUTargetMachine::getSubtargetImpl(
const Function &F) const {
if (getTargetTriple().getArch() == Triple::amdgcn)
}
public:
- explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const DataLayout &DL)
- : BaseT(TM, DL), ST(TM->getSubtargetImpl()),
- TLI(ST->getTargetLowering()) {}
+ explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
// Provide value semantics. MSVC requires that we spell all of these out.
AMDGPUTTIImpl(const AMDGPUTTIImpl &Arg)
--- /dev/null
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
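+; Check that subtargets are constructed per function: "target-cpu" and
+; "target-features" attributes on each function should override the
+; -march defaults independently of the rest of the module.
+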
+declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #1
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; CI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+
+; VI+ intrinsic
+declare void @llvm.amdgcn.s.dcache.wb() #0
+
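+; No "target-cpu" attribute, so the target machine's default CPU is used.
+; SI cannot encode the 0x400 kernarg offset as an SMRD immediate, so it is
+; materialized in a register first.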
+; CHECK-LABEL: {{^}}target_none:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_none() #0 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
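+; Explicitly requesting tahiti should produce the same code as the default
+; run line above.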
+; CHECK-LABEL: {{^}}target_tahiti:
+; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+define void @target_tahiti() #2 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ ret void
+}
+
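+; bonaire (CI) can encode this kernarg offset directly as an SMRD immediate
+; and supports s_dcache_inv_vol.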
+; CHECK-LABEL: {{^}}target_bonaire:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
+; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
+; CHECK: s_dcache_inv_vol
+define void @target_bonaire() #3 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.inv.vol()
+ ret void
+}
+
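+; fiji (VI) encodes the SMRD offset in bytes (0x400), uses flat memory
+; instructions, and supports s_dcache_wb.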
+; CHECK-LABEL: {{^}}target_fiji:
+; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
+; CHECK: flat_store_dword
+; CHECK: s_dcache_wb{{$}}
+define void @target_fiji() #4 {
+ %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+ %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
+ %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
+ %ptr = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %kernargs.gep.cast
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %id.ext = sext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
+ store i32 0, i32 addrspace(1)* %gep
+ call void @llvm.amdgcn.s.dcache.wb()
+ ret void
+}
+
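+; With +promote-alloca the private array is promoted to LDS, so the load
+; becomes a ds_read and the kernel reports LDS usage.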
+; CHECK-LABEL: {{^}}promote_alloca_enabled:
+; CHECK: ds_read_b32
+; CHECK: ; LDSByteSize: 5120
+define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
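+; With -promote-alloca the array stays on the stack, so a scratch resource
+; descriptor and a nonzero ScratchSize are emitted.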
+; CHECK-LABEL: {{^}}promote_alloca_disabled:
+; CHECK: SCRATCH_RSRC_DWORD0
+; CHECK: SCRATCH_RSRC_DWORD1
+; CHECK: ScratchSize: 24
+define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+entry:
+ %stack = alloca [5 x i32], align 4
+ %tmp = load i32, i32 addrspace(1)* %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp
+ %load = load i32, i32* %arrayidx1
+ store i32 %load, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-cpu"="tahiti" }
+attributes #3 = { nounwind "target-cpu"="bonaire" }
+attributes #4 = { nounwind "target-cpu"="fiji" }
+attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }
+attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }