From: Alexander Timofeev Date: Thu, 15 Jun 2017 19:33:10 +0000 (+0000) Subject: DivergenceAnalysis patch for review X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7807f69e9b26b8af6294f00cf5db525acca0be32;p=llvm DivergenceAnalysis patch for review git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305494 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 471b8640cf7..af2ebb7b6b4 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -235,6 +235,11 @@ public: /// starting with the sources of divergence. bool isSourceOfDivergence(const Value *V) const; + /// \brief Returns true for the target-specific + /// set of operations that produce a uniform result + /// even when taking non-uniform arguments. + bool isAlwaysUniform(const Value *V) const; + /// Returns the address space ID for a target's 'flat' address space. Note /// this is not necessarily the same as addrspace(0), which LLVM sometimes /// refers to as the generic address space. 
The flat address space is a @@ -821,6 +826,7 @@ public: virtual int getUserCost(const User *U) = 0; virtual bool hasBranchDivergence() = 0; virtual bool isSourceOfDivergence(const Value *V) = 0; + virtual bool isAlwaysUniform(const Value *V) = 0; virtual unsigned getFlatAddressSpace() = 0; virtual bool isLoweredToCall(const Function *F) = 0; virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0; @@ -998,6 +1004,10 @@ public: return Impl.isSourceOfDivergence(V); } + bool isAlwaysUniform(const Value *V) override { + return Impl.isAlwaysUniform(V); + } + unsigned getFlatAddressSpace() override { return Impl.getFlatAddressSpace(); } diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 7884ffe8cc2..24ac3b1213e 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -177,6 +177,8 @@ public: bool isSourceOfDivergence(const Value *V) { return false; } + bool isAlwaysUniform(const Value *V) { return false; } + unsigned getFlatAddressSpace () { return -1; } diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h index 75277cddcfe..5eb7a0f61ee 100644 --- a/include/llvm/CodeGen/BasicTTIImpl.h +++ b/include/llvm/CodeGen/BasicTTIImpl.h @@ -93,6 +93,8 @@ public: bool isSourceOfDivergence(const Value *V) { return false; } + bool isAlwaysUniform(const Value *V) { return false; } + unsigned getFlatAddressSpace() { // Return an invalid address space. return -1; diff --git a/lib/Analysis/DivergenceAnalysis.cpp b/lib/Analysis/DivergenceAnalysis.cpp index 1b36569f7a0..2d39a0b0215 100644 --- a/lib/Analysis/DivergenceAnalysis.cpp +++ b/lib/Analysis/DivergenceAnalysis.cpp @@ -241,7 +241,7 @@ void DivergencePropagator::exploreDataDependency(Value *V) { // Follow def-use chains of V. 
for (User *U : V->users()) { Instruction *UserInst = cast<Instruction>(U); - if (DV.insert(UserInst).second) + if (!TTI.isAlwaysUniform(U) && DV.insert(UserInst).second) Worklist.push_back(UserInst); } } diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 488cb332a0b..92328f6e5ef 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -103,6 +103,10 @@ bool TargetTransformInfo::isSourceOfDivergence(const Value *V) const { return TTIImpl->isSourceOfDivergence(V); } +bool llvm::TargetTransformInfo::isAlwaysUniform(const Value *V) const { + return TTIImpl->isAlwaysUniform(V); +} + unsigned TargetTransformInfo::getFlatAddressSpace() const { return TTIImpl->getFlatAddressSpace(); } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 3c788fa1dce..6f002860044 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -107,7 +107,7 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { DFS(Start, Checklist); for (auto &BB : Checklist) { - BasicBlock::iterator StartIt = (BB == Load->getParent()) ? + BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ? 
BasicBlock::iterator(Load) : BB->end(); if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true, StartIt, BB, Load).isClobber()) diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 65dba7d6055..0d6689bd04c 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -489,6 +489,19 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } +bool AMDGPUTTIImpl::isAlwaysUniform(const Value *V) const { + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + switch (Intrinsic->getIntrinsicID()) { + default: + return false; + case Intrinsic::amdgcn_readfirstlane: + case Intrinsic::amdgcn_readlane: + return true; + } + } + return false; +} + unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { if (ST->hasVOP3PInsts()) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 8466f118395..a60b1bb1b59 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -103,6 +103,7 @@ public: int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); bool isSourceOfDivergence(const Value *V) const; + bool isAlwaysUniform(const Value *V) const; unsigned getFlatAddressSpace() const { // Don't bother running InferAddressSpaces pass on graphics shaders which diff --git a/test/CodeGen/AMDGPU/always-uniform.ll b/test/CodeGen/AMDGPU/always-uniform.ll new file mode 100644 index 00000000000..4ba57fba81b --- /dev/null +++ b/test/CodeGen/AMDGPU/always-uniform.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple amdgcn-amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.readfirstlane(i32) + +; GCN-LABEL: readfirstlane_uniform +; GCN: s_load_dwordx2 
s{{\[}}[[IN_ADDR:[0-9]+]]:1{{\]}}, s[4:5], 0x0 +; GCN: v_readfirstlane_b32 s[[SCALAR:[0-9]+]], v0 +; GCN: s_add_u32 s[[LOAD_ADDR:[0-9]+]], s[[IN_ADDR]], s[[SCALAR]] +; GCN: s_load_dword s{{[0-9]+}}, s{{\[}}[[LOAD_ADDR]] + +define amdgpu_kernel void @readfirstlane_uniform(float addrspace(1)* noalias nocapture readonly, float addrspace(1)* noalias nocapture readonly) { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %scalar = tail call i32 @llvm.amdgcn.readfirstlane(i32 %tid) + %idx = zext i32 %scalar to i64 + %gep0 = getelementptr inbounds float, float addrspace(1)* %0, i64 %idx + %val = load float, float addrspace(1)* %gep0, align 4 + %gep1 = getelementptr inbounds float, float addrspace(1)* %1, i64 10 + store float %val, float addrspace(1)* %gep1, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/global_smrd_cfg.ll b/test/CodeGen/AMDGPU/global_smrd_cfg.ll index a6a04151caa..be6e3fd05ae 100644 --- a/test/CodeGen/AMDGPU/global_smrd_cfg.ll +++ b/test/CodeGen/AMDGPU/global_smrd_cfg.ll @@ -72,6 +72,39 @@ bb22: ; preds = %bb20, %bb11 br i1 %tmp31, label %bb7, label %bb11 } +; One more test to ensure that an aliasing store after the load +; is considered clobbering if the load's parent block is the same +; as a loop header block. + +; CHECK-LABEL: %bb1 + +; The load from %arg has an aliasing store after the load, +; but it is considered clobbering because of the loop. 
+ +; CHECK: flat_load_dword + +define amdgpu_kernel void @cfg_selfloop(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 { +bb: + br label %bb1 + +bb2: + ret void + +bb1: + %tmp13 = phi i32 [ %tmp25, %bb1 ], [ 0, %bb ] + %tmp14 = srem i32 %tmp13, %arg2 + %tmp15 = sext i32 %tmp14 to i64 + %tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15 + %tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0 + %tmp19 = sext i32 %tmp13 to i64 + %tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19 + store i32 %tmp17, i32 addrspace(1)* %tmp21, align 4, !tbaa !0 + %tmp25 = add nuw nsw i32 %tmp13, 1 + %tmp31 = icmp eq i32 %tmp25, 100 + br i1 %tmp31, label %bb2, label %bb1 +} + + attributes #0 = { "target-cpu"="fiji" } !0 = !{!1, !1, i64 0}