From 0974536c5b45dca4ee32b30ad8f77f10fca752a7 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 21 Sep 2017 18:44:49 +0000 Subject: [PATCH] [NVPTX] Implemented bar.warp.sync, barrier.sync, and vote{.sync} instructions/intrinsics/builtins. Differential Revision: https://reviews.llvm.org/D38148 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@313898 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/BuiltinsNVPTX.def | 14 +++++++++ lib/Headers/__clang_cuda_intrinsics.h | 31 ++++++++++++++++++++ test/CodeGen/builtins-nvptx-ptx60.cu | 41 +++++++++++++++++++++++++-- test/CodeGen/builtins-nvptx.c | 12 ++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) diff --git a/include/clang/Basic/BuiltinsNVPTX.def b/include/clang/Basic/BuiltinsNVPTX.def index 0cd4195b55..4cd839323d 100644 --- a/include/clang/Basic/BuiltinsNVPTX.def +++ b/include/clang/Basic/BuiltinsNVPTX.def @@ -378,6 +378,9 @@ BUILTIN(__nvvm_bar0_popc, "ii", "") BUILTIN(__nvvm_bar0_and, "ii", "") BUILTIN(__nvvm_bar0_or, "ii", "") BUILTIN(__nvvm_bar_sync, "vi", "n") +TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", "ptx60") +TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", "ptx60") +TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", "ptx60") // Shuffle @@ -399,6 +402,17 @@ TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60") TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iUiiii", "", "ptx60") TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60") +// Vote +BUILTIN(__nvvm_vote_all, "bb", "") +BUILTIN(__nvvm_vote_any, "bb", "") +BUILTIN(__nvvm_vote_uni, "bb", "") +BUILTIN(__nvvm_vote_ballot, "Uib", "") + +TARGET_BUILTIN(__nvvm_vote_all_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", "ptx60") + // Membar BUILTIN(__nvvm_membar_cta, "v", "") diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h index c191a53059..52dfe617aa 100644 --- a/lib/Headers/__clang_cuda_intrinsics.h +++ b/lib/Headers/__clang_cuda_intrinsics.h @@ -157,6 +157,37 @@ __MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32, #pragma pop_macro("__MAKE_SYNC_SHUFFLES") +inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { + return __nvvm_bar_warp_sync(mask); +} + +inline __device__ void __barrier_sync(unsigned int id) { + __nvvm_barrier_sync(id); +} + +inline __device__ void __barrier_sync_count(unsigned int id, + unsigned int count) { + __nvvm_barrier_sync_cnt(id, count); +} + +inline __device__ int __all_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_all(mask, pred); +} + +inline __device__ int __any_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_any(mask, pred); +} + +inline __device__ int __uni_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_uni(mask, pred); +} + +inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_ballot(mask, pred); +} + +inline __device__ activemask() { return __nvvm_vote.ballot(1); } + #endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || // __CUDA_ARCH__ >= 300) diff --git a/test/CodeGen/builtins-nvptx-ptx60.cu b/test/CodeGen/builtins-nvptx-ptx60.cu index e06c84c150..5bd9ca3009 100644 --- a/test/CodeGen/builtins-nvptx-ptx60.cu +++ b/test/CodeGen/builtins-nvptx-ptx60.cu @@ -10,8 +10,27 @@ #define __shared__ __attribute__((shared)) #define __constant__ __attribute__((constant)) -// CHECK-LABEL: nvvm_shfl_sync -__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) { +// We have to keep all builtins that depend on particular target feature in the +// same function, because the codegen will stop after the very first function +// that encounters an error, so -verify will not be able to find errors in +// subsequent functions. + +// CHECK-LABEL: nvvm_sync +__device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b, + bool pred) { + // CHECK: call void @llvm.nvvm.bar.warp.sync(i32 + // expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}} + __nvvm_bar_warp_sync(mask); + // CHECK: call void @llvm.nvvm.barrier.sync(i32 + // expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}} + __nvvm_barrier_sync(mask); + // CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32 + // expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}} + __nvvm_barrier_sync_cnt(mask, i); + + // + // SHFL.SYNC + // // CHECK: call i32 @llvm.nvvm.shfl.sync.down.i32(i32 {{%[0-9]+}}, i32 // expected-error@+1 {{'__nvvm_shfl_sync_down_i32' needs target feature ptx60}} __nvvm_shfl_sync_down_i32(mask, i, a, b); @@ -36,5 +55,23 @@ __device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) { // CHECK: call float @llvm.nvvm.shfl.sync.idx.f32(i32 {{%[0-9]+}}, float // expected-error@+1 {{'__nvvm_shfl_sync_idx_f32' needs target feature ptx60}} __nvvm_shfl_sync_idx_f32(mask, f, a, b); + + // + // VOTE.SYNC + // + + // CHECK: call i1 @llvm.nvvm.vote.all.sync(i32 + // expected-error@+1 {{'__nvvm_vote_all_sync' needs target feature ptx60}} + __nvvm_vote_all_sync(mask, pred); + // CHECK: call i1 @llvm.nvvm.vote.any.sync(i32 + // expected-error@+1 {{'__nvvm_vote_any_sync' needs target feature ptx60}} + __nvvm_vote_any_sync(mask, pred); + // CHECK: call i1 @llvm.nvvm.vote.uni.sync(i32 + // expected-error@+1 {{'__nvvm_vote_uni_sync' needs target feature ptx60}} + __nvvm_vote_uni_sync(mask, pred); + // CHECK: call i32 @llvm.nvvm.vote.ballot.sync(i32 + // expected-error@+1 {{'__nvvm_vote_ballot_sync' needs target feature ptx60}} + __nvvm_vote_ballot_sync(mask, pred); + // CHECK: ret void } diff --git a/test/CodeGen/builtins-nvptx.c b/test/CodeGen/builtins-nvptx.c index c97b549cbe..89a982377a 100644 --- a/test/CodeGen/builtins-nvptx.c +++ b/test/CodeGen/builtins-nvptx.c @@ -657,3 +657,15 @@ __device__ void nvvm_shfl(int i, float f, int a, int b) { __nvvm_shfl_idx_f32(f, a, b); // CHECK: ret void } + +__device__ void nvvm_vote(int pred) { + // CHECK: call i1 @llvm.nvvm.vote.all(i1 + __nvvm_vote_all(pred); + // CHECK: call i1 @llvm.nvvm.vote.any(i1 + __nvvm_vote_any(pred); + // CHECK: call i1 @llvm.nvvm.vote.uni(i1 + __nvvm_vote_uni(pred); + // CHECK: call i32 @llvm.nvvm.vote.ballot(i1 + __nvvm_vote_ballot(pred); + // CHECK: ret void +} -- 2.40.0