[CUDA] Use activemask.b32 instruction to implement __activemask w/ CUDA-9.2+

author Artem Belevich <tra@google.com>

Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)

committer Artem Belevich <tra@google.com>

Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)
author Artem Belevich <tra@google.com>
Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)
committer Artem Belevich <tra@google.com>
Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h

index 2970d17f89ee5cae199ce9775b901dd1f075847c..b67461a146fc679ee695e9e02525dcd863d0121e 100644 (file)
--- a/lib/Headers/__clang_cuda_intrinsics.h
+++ b/lib/Headers/__clang_cuda_intrinsics.h
@@ -211,7 +211,15 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
    return __nvvm_vote_ballot_sync(mask, pred);
  }
  
-inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+inline __device__ unsigned int __activemask() {  // Returns a 32-bit mask obtained from the toolkit-appropriate primitive selected below.
+#if CUDA_VERSION < 9020  // Toolkits older than CUDA 9.2: keep the previous implementation.
+  return __nvvm_vote_ballot(1);  // Non-sync ballot with a constant-true predicate.
+#else  // CUDA 9.2 and newer: use the dedicated PTX instruction instead of a ballot.
+  unsigned int mask;  // Receives the instruction's 32-bit result.
+  asm volatile("activemask.b32 %0;" : "=r"(mask));  // "=r" binds %0 to `mask` as a write-only register output; `volatile` keeps the asm statement from being optimized away.
+  return mask;
+#endif
+}
  
  inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
    return __nvvm_fns(mask, base, offset);
author	Artem Belevich <tra@google.com>
	Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)
committer	Artem Belevich <tra@google.com>
	Tue, 3 Sep 2019 17:31:58 +0000 (17:31 +0000)