From: Artem Belevich Date: Tue, 3 Sep 2019 17:31:58 +0000 (+0000) Subject: [CUDA] Use activemask.b32 instruction to implement __activemask w/ CUDA-9.2+ X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4fbb9360b74b83cec392f205eb7df60838482304;p=clang [CUDA] Use activemask.b32 instruction to implement __activemask w/ CUDA-9.2+ The vote.ballot instruction is gone in recent CUDA versions, and vote.sync.ballot cannot be used because it needs a thread mask parameter. Fortunately, PTX 6.2 (introduced with CUDA-9.2) provides the activemask.b32 instruction for this. Differential Revision: https://reviews.llvm.org/D66665 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@370792 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h index 2970d17f89..b67461a146 100644 --- a/lib/Headers/__clang_cuda_intrinsics.h +++ b/lib/Headers/__clang_cuda_intrinsics.h @@ -211,7 +211,15 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { return __nvvm_vote_ballot_sync(mask, pred); } -inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } +inline __device__ unsigned int __activemask() { +#if CUDA_VERSION < 9020 + return __nvvm_vote_ballot(1); +#else + unsigned int mask; + asm volatile("activemask.b32 %0;" : "=r"(mask)); + return mask; +#endif +} inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { return __nvvm_fns(mask, base, offset);