From: Stanislav Mekhanoshin Date: Mon, 17 Jun 2019 17:57:50 +0000 (+0000) Subject: [AMDGPU] gfx1010 wavefrontsize intrinsic folding X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=95dd0d84d21ea397636287892bf0df588711566a;p=llvm [AMDGPU] gfx1010 wavefrontsize intrinsic folding Differential Revision: https://reviews.llvm.org/D63206 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363588 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 94dad0d7470..19a7b38004c 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -53,7 +53,8 @@ FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); FunctionPass *createSIFormMemoryClausesPass(); -FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &); +FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &, + const TargetMachine *); FunctionPass *createAMDGPUUseNativeCallsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(); FunctionPass *createAMDGPUMachineCFGStructurizerPass(); diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp index 581e229b4a0..7156824638a 100644 --- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPULibFunc.h" +#include "AMDGPUSubtarget.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/ADT/StringSet.h" @@ -22,6 +23,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" @@ -29,6 +31,7 @@ #include "llvm/IR/ValueSymbolTable.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include @@ -65,6 +68,8 @@ private: typedef llvm::AMDGPULibFunc FuncInfo; + const TargetMachine *TM; + // -fuse-native. bool AllNative = false; @@ -134,6 +139,9 @@ private: // __read_pipe/__write_pipe bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B, FuncInfo &FInfo); + // llvm.amdgcn.wavefrontsize + bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B); + // Get insertion point at entry. BasicBlock::iterator getEntryIns(CallInst * UI); // Insert an Alloc instruction. @@ -152,6 +160,8 @@ protected: } public: + AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {} + bool fold(CallInst *CI, AliasAnalysis *AA = nullptr); void initNativeFuncs(); @@ -166,15 +176,16 @@ namespace { class AMDGPUSimplifyLibCalls : public FunctionPass { - AMDGPULibCalls Simplifier; - const TargetOptions Options; + AMDGPULibCalls Simplifier; + public: static char ID; // Pass identification - AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions()) - : FunctionPass(ID), Options(Opt) { + AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(), + const TargetMachine *TM = nullptr) + : FunctionPass(ID), Options(Opt), Simplifier(TM) { initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } @@ -639,14 +650,6 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { // Ignore indirect calls. if (Callee == 0) return false; - FuncInfo FInfo; - if (!parseFunctionName(Callee->getName(), &FInfo)) - return false; - - // Further check the number of arguments to see if they match. - if (CI->getNumArgOperands() != FInfo.getNumArgs()) - return false; - BasicBlock *BB = CI->getParent(); LLVMContext &Context = CI->getParent()->getContext(); IRBuilder<> B(Context); @@ -658,6 +661,21 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) { if (const FPMathOperator *FPOp = dyn_cast(CI)) B.setFastMathFlags(FPOp->getFastMathFlags()); + switch (Callee->getIntrinsicID()) { + default: + break; + case Intrinsic::amdgcn_wavefrontsize: + return !EnablePreLink && fold_wavefrontsize(CI, B); + } + + FuncInfo FInfo; + if (!parseFunctionName(Callee->getName(), &FInfo)) + return false; + + // Further check the number of arguments to see if they match. + if (CI->getNumArgOperands() != FInfo.getNumArgs()) + return false; + if (TDOFold(CI, FInfo)) return true; @@ -1371,6 +1389,29 @@ bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B, return true; } +bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) { + if (!TM) + return false; + + StringRef CPU = TM->getTargetCPU(); + StringRef Features = TM->getTargetFeatureString(); + if ((CPU.empty() || CPU.equals_lower("generic")) && + (Features.empty() || + Features.find_lower("wavefrontsize") == StringRef::npos)) + return false; + + Function *F = CI->getParent()->getParent(); + const GCNSubtarget &ST = TM->getSubtarget(*F); + unsigned N = ST.getWavefrontSize(); + + LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with " + << N << "\n"); + + CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N)); + CI->eraseFromParent(); + return true; +} + // Get insertion point at entry. BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) { Function * Func = UI->getParent()->getParent(); @@ -1680,8 +1721,9 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) { } // Public interface to the Simplify LibCalls pass. -FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt) { - return new AMDGPUSimplifyLibCalls(Opt); +FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt, + const TargetMachine *TM) { + return new AMDGPUSimplifyLibCalls(Opt, TM); } FunctionPass *llvm::createAMDGPUUseNativeCallsPass() { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 41a075756ee..ae422940b25 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -432,7 +432,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this)); PM.add(llvm::createAMDGPUUseNativeCallsPass()); if (LibCallSimplify) - PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt)); + PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this)); }); Builder.addExtension( diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll new file mode 100644 index 00000000000..806673b5841 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll @@ -0,0 +1,84 @@ +; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s + +; RUN: opt -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s +; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-WXX %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize32 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+WavefrontSize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32,-wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W32 %s +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=-wavefrontsize32,+wavefrontsize64 -S < %s | FileCheck -check-prefixes=OPT,OPT-W64 %s + +; GCN-LABEL: {{^}}fold_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize( + +; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32 +; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64 +; GCN: store_dword v[{{[0-9:]+}}], [[V]] + +; OPT-W32: store i32 32, i32 addrspace(1)* %arg, align 4 +; OPT-W64: store i32 64, i32 addrspace(1)* %arg, align 4 +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: store i32 %tmp, i32 addrspace(1)* %arg, align 4 +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + store i32 %tmp, i32 addrspace(1)* %arg, align 4 + ret void +} + +; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize( + +; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}} +; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}} +; GCN-NOT: cndmask +; GCN: store_dword v[{{[0-9:]+}}], [[V]] + +; OPT-W32: store i32 1, i32 addrspace(1)* %arg, align 4 +; OPT-W64: store i32 2, i32 addrspace(1)* %arg, align 4 +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32 +; OPT-WXX: %tmp2 = select i1 %tmp1, i32 2, i32 1 +; OPT-WXX: store i32 %tmp2, i32 addrspace(1)* %arg +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_and_optimize_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + %tmp2 = select i1 %tmp1, i32 2, i32 1 + store i32 %tmp2, i32 addrspace(1)* %arg + ret void +} + +; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize: +; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize( + +; OPT: bb: +; OPT-WXX: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() +; OPT-WXX: %tmp1 = icmp ugt i32 %tmp, 32 +; OPT-WXX: bb3: +; OPT-W64: store i32 1, i32 addrspace(1)* %arg, align 4 +; OPT-NEXT: ret void + +define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0 + %tmp1 = icmp ugt i32 %tmp, 32 + br i1 %tmp1, label %bb2, label %bb3 + +bb2: ; preds = %bb + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %bb3 + +bb3: ; preds = %bb2, %bb + ret void +} + +declare i32 @llvm.amdgcn.wavefrontsize() #0 + +attributes #0 = { nounwind readnone speculatable }