From 45b902e6896fcf9db3e79c1e4a49424cb67c8447 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 13 Oct 2016 20:52:12 +0000
Subject: [PATCH] [CUDA] Emit deferred diagnostics during Sema rather than
 during codegen.

Summary:
Emitting deferred diagnostics during codegen was a hack. It did work,
but usability was poor, both for us as compiler devs and for users. We
don't codegen if there are any sema errors, so for users this meant that
they wouldn't see deferred errors if there were any non-deferred errors.
For devs, this meant that we had to carefully split up our tests so that
when we tested deferred errors, we didn't emit any non-deferred errors.

This change moves checking for deferred errors into Sema. See the big
comment in SemaCUDA.cpp for an overview of the idea.

This checking adds overhead to compilation, because we have to maintain
a partial call graph. As a result, this change makes deferred errors a
CUDA-only concept (whereas before they were a general concept). If
anyone else wants to use this framework for something other than CUDA,
we can generalize at that time.

This patch makes the minimal set of test changes -- after this lands,
I'll go back through and do a cleanup of the tests that we no longer
have to split up.

Reviewers: rnk

Subscribers: cfe-commits, rsmith, tra

Differential Revision: https://reviews.llvm.org/D25541

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@284158 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/AST/ASTContext.h            |  11 --
 include/clang/AST/Decl.h                  |   8 -
 include/clang/Sema/Sema.h                 |  43 +++--
 lib/AST/Decl.cpp                          |  14 --
 lib/CodeGen/CodeGenModule.cpp             |  44 -----
 lib/CodeGen/CodeGenModule.h               |   4 -
 lib/Sema/SemaCUDA.cpp                     | 211 ++++++++++++++++++----
 test/Parser/lambda-attr.cu                |   2 +-
 test/SemaCUDA/call-host-fn-from-device.cu |   3 +-
 test/SemaCUDA/function-overload.cu        |  44 ++++-
 test/SemaCUDA/method-target.cu            |   6 +-
 test/SemaCUDA/reference-to-kernel-fn.cu   |   5 +-
 12 files changed, 254 insertions(+), 141 deletions(-)

diff --git a/include/clang/AST/ASTContext.h b/include/clang/AST/ASTContext.h
index 711374b615..3d777524b8 100644
--- a/include/clang/AST/ASTContext.h
+++ b/include/clang/AST/ASTContext.h
@@ -448,12 +448,6 @@ private:
   /// \brief Allocator for partial diagnostics.
   PartialDiagnostic::StorageAllocator DiagAllocator;
 
-  /// Diagnostics that are emitted if and only if the given function is
-  /// codegen'ed. Access these through FunctionDecl::addDeferredDiag() and
-  /// FunctionDecl::takeDeferredDiags().
-  llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
-      DeferredDiags;
-
   /// \brief The current C++ ABI.
   std::unique_ptr<CXXABI> ABI;
   CXXABI *createCXXABI(const TargetInfo &T);
@@ -604,11 +598,6 @@ public:
     return DiagAllocator;
   }
 
-  decltype(DeferredDiags) &getDeferredDiags() { return DeferredDiags; }
-  const decltype(DeferredDiags) &getDeferredDiags() const {
-    return DeferredDiags;
-  }
-
   const TargetInfo &getTargetInfo() const { return *Target; }
   const TargetInfo *getAuxTargetInfo() const { return AuxTarget; }
 
diff --git a/include/clang/AST/Decl.h b/include/clang/AST/Decl.h
index 90d87272d4..d24b37af61 100644
--- a/include/clang/AST/Decl.h
+++ b/include/clang/AST/Decl.h
@@ -2271,14 +2271,6 @@ public:
   /// returns 0.
   unsigned getMemoryFunctionKind() const;
 
-  /// Add a diagnostic to be emitted if and when this function is codegen'ed.
-  void addDeferredDiag(PartialDiagnosticAt PD);
-
-  /// Gets this object's list of deferred diagnostics, if there are any.
-  ///
-  /// Although this is logically const, it clears our list of deferred diags.
-  std::vector<PartialDiagnosticAt> takeDeferredDiags() const;
-
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
   static bool classofKind(Kind K) {
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 1417b23b9a..d0f62d9d37 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -9245,6 +9245,30 @@ public:
   /// before incrementing, so you can emit an error.
   bool PopForceCUDAHostDevice();
 
+  /// Diagnostics that are emitted only if we discover that the given function
+  /// must be codegen'ed. Because handling these correctly adds overhead to
+  /// compilation, this is currently only enabled for CUDA compilations.
+  llvm::DenseMap<FunctionDecl *, std::vector<PartialDiagnosticAt>>
+      CUDADeferredDiags;
+
+  /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
+  /// (maybe deferred) "bad call" diagnostic. We use this to avoid emitting the
+  /// same deferred diag twice.
+  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
+
+  /// The set of CUDA functions that we've discovered must be emitted by
+  /// tracing the call graph. Functions that we can tell a priori must be
+  /// emitted aren't added to this set.
+  llvm::DenseSet<FunctionDecl *> CUDAKnownEmittedFns;
+
+  /// A partial call graph maintained during CUDA compilation to support
+  /// deferred diagnostics. Specifically, functions are only added here if, at
+  /// the time they're added, they are not known-emitted. As soon as we
+  /// discover that a function is known-emitted, we remove it and everything it
+  /// transitively calls from this set and add those functions to
+  /// CUDAKnownEmittedFns.
+  llvm::DenseMap<FunctionDecl *, llvm::SetVector<FunctionDecl *>> CUDACallGraph;
+
   /// Diagnostic builder for CUDA errors which may or may not be deferred.
   ///
   /// In CUDA, there exist constructs (e.g. variable-length arrays, try/catch)
@@ -9298,12 +9322,15 @@ public:
 
 private:
   struct PartialDiagnosticInfo {
-    PartialDiagnosticInfo(SourceLocation Loc, PartialDiagnostic PD,
+    PartialDiagnosticInfo(Sema &S, SourceLocation Loc, PartialDiagnostic PD,
                           FunctionDecl *Fn)
-        : Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
+        : S(S), Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
 
-    ~PartialDiagnosticInfo() { Fn->addDeferredDiag({Loc, std::move(PD)}); }
+    ~PartialDiagnosticInfo() {
+      S.CUDADeferredDiags[Fn].push_back({Loc, std::move(PD)});
+    }
 
+    Sema &S;
     SourceLocation Loc;
     PartialDiagnostic PD;
    FunctionDecl *Fn;
@@ -9322,8 +9349,8 @@ public:
   /// - If CurContext is a __device__ or __global__ function, emits the
   ///   diagnostics immediately.
   /// - If CurContext is a __host__ __device__ function and we are compiling for
-  ///   the device, creates a deferred diagnostic which is emitted if and when
-  ///   the function is codegen'ed.
+  ///   the device, creates a diagnostic which is emitted if and when we realize
+  ///   that the function will be codegen'ed.
   ///
   /// Example usage:
   ///
@@ -9397,12 +9424,6 @@ public:
   void maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *FD,
                                    const LookupResult &Previous);
 
-private:
-  /// Raw encodings of SourceLocations for which CheckCUDACall has emitted a
-  /// (maybe deferred) "bad call" diagnostic. We use this to avoid emitting the
-  /// same deferred diag twice.
-  llvm::DenseSet<unsigned> LocsWithCUDACallDiags;
-
 public:
   /// Check whether we're allowed to call Callee from the current context.
   ///
diff --git a/lib/AST/Decl.cpp b/lib/AST/Decl.cpp
index e330e31d47..657d2e01dc 100644
--- a/lib/AST/Decl.cpp
+++ b/lib/AST/Decl.cpp
@@ -3473,20 +3473,6 @@ unsigned FunctionDecl::getMemoryFunctionKind() const {
   return 0;
 }
 
-void FunctionDecl::addDeferredDiag(PartialDiagnosticAt PD) {
-  getASTContext().getDeferredDiags()[this].push_back(std::move(PD));
-}
-
-std::vector<PartialDiagnosticAt> FunctionDecl::takeDeferredDiags() const {
-  auto &DD = getASTContext().getDeferredDiags();
-  auto It = DD.find(this);
-  if (It == DD.end())
-    return {};
-  auto Ret = std::move(It->second);
-  DD.erase(It);
-  return Ret;
-}
-
 //===----------------------------------------------------------------------===//
 // FieldDecl Implementation
 //===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 281da3fd65..f4628fc0d3 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -499,19 +499,6 @@ void CodeGenModule::Release() {
   EmitVersionIdentMetadata();
 
   EmitTargetMetadata();
-
-  // Emit any deferred diagnostics gathered during codegen. We didn't emit them
-  // when we first discovered them because that would have halted codegen,
-  // preventing us from gathering other deferred diags.
-  for (const PartialDiagnosticAt &DiagAt : DeferredDiags) {
-    SourceLocation Loc = DiagAt.first;
-    const PartialDiagnostic &PD = DiagAt.second;
-    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
-    PD.Emit(Builder);
-  }
-  // Clear the deferred diags so they don't outlive the ASTContext's
-  // PartialDiagnostic allocator.
-  DeferredDiags.clear();
 }
 
 void CodeGenModule::UpdateCompletedType(const TagDecl *TD) {
@@ -2913,37 +2900,6 @@ void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
                                                  llvm::GlobalValue *GV) {
   const auto *D = cast<FunctionDecl>(GD.getDecl());
 
-  // Emit this function's deferred diagnostics, if none of them are errors. If
-  // any of them are errors, don't codegen the function, but also don't emit any
-  // of the diagnostics just yet. Emitting an error during codegen stops
-  // further codegen, and we want to display as many deferred diags as possible.
-  // We'll emit the now twice-deferred diags at the very end of codegen.
-  //
-  // (If a function has both error and non-error diags, we don't emit the
-  // non-error diags here, because order can be significant, e.g. with notes
-  // that follow errors.)
-  auto Diags = D->takeDeferredDiags();
-  if (auto *Templ = D->getPrimaryTemplate()) {
-    auto TemplDiags = Templ->getAsFunction()->takeDeferredDiags();
-    Diags.insert(Diags.end(), TemplDiags.begin(), TemplDiags.end());
-  }
-  bool HasError = llvm::any_of(Diags, [this](const PartialDiagnosticAt &PDAt) {
-    return getDiags().getDiagnosticLevel(PDAt.second.getDiagID(), PDAt.first) >=
-           DiagnosticsEngine::Error;
-  });
-  if (HasError) {
-    DeferredDiags.insert(DeferredDiags.end(),
-                         std::make_move_iterator(Diags.begin()),
-                         std::make_move_iterator(Diags.end()));
-    return;
-  }
-  for (PartialDiagnosticAt &PDAt : Diags) {
-    const SourceLocation &Loc = PDAt.first;
-    const PartialDiagnostic &PD = PDAt.second;
-    DiagnosticBuilder Builder(getDiags().Report(Loc, PD.getDiagID()));
-    PD.Emit(Builder);
-  }
-
   // Compute the function info and LLVM type.
   const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
   llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
 
diff --git a/lib/CodeGen/CodeGenModule.h b/lib/CodeGen/CodeGenModule.h
index c19e5bcdc1..e896b11a7a 100644
--- a/lib/CodeGen/CodeGenModule.h
+++ b/lib/CodeGen/CodeGenModule.h
@@ -490,10 +490,6 @@ private:
   /// MDNodes.
   llvm::DenseMap<QualType, llvm::Metadata *> MetadataIdMap;
 
-  /// Diags gathered from FunctionDecl::takeDeferredDiags(). Emitted at the
-  /// very end of codegen.
-  std::vector<std::pair<SourceLocation, PartialDiagnostic>> DeferredDiags;
-
 public:
   CodeGenModule(ASTContext &C, const HeaderSearchOptions &headersearchopts,
                 const PreprocessorOptions &ppopts,
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index 717fe4ad52..5333a44275 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -499,27 +499,84 @@ Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
     break;
   case K_Deferred:
     assert(Fn && "Must have a function to attach the deferred diag to.");
-    PartialDiagInfo.emplace(Loc, S.PDiag(DiagID), Fn);
+    PartialDiagInfo.emplace(S, Loc, S.PDiag(DiagID), Fn);
     break;
   }
 }
 
+// In CUDA, there are some constructs which may appear in semantically-valid
+// code, but trigger errors if we ever generate code for the function in which
+// they appear. Essentially every construct you're not allowed to use on the
+// device falls into this category, because you are allowed to use these
+// constructs in a __host__ __device__ function, but only if that function is
+// never codegen'ed on the device.
+//
+// To handle semantic checking for these constructs, we keep track of the set of
+// functions we know will be emitted, either because we could tell a priori that
+// they would be emitted, or because they were transitively called by a
+// known-emitted function.
+//
+// We also keep a partial call graph of which not-known-emitted functions call
+// which other not-known-emitted functions.
+//
+// When we see something which is illegal if the current function is emitted
+// (usually by way of CUDADiagIfDeviceCode, CUDADiagIfHostCode, or
+// CheckCUDACall), we first check if the current function is known-emitted. If
+// so, we immediately output the diagnostic.
+//
+// Otherwise, we "defer" the diagnostic. It sits in Sema::CUDADeferredDiags
+// until we discover that the function is known-emitted, at which point we take
+// it out of this map and emit the diagnostic.

+// Do we know that we will eventually codegen the given function?
+static bool IsKnownEmitted(Sema &S, FunctionDecl *FD) {
+  // Templates are emitted when they're instantiated.
+  if (FD->isDependentContext())
+    return false;
+
+  // When compiling for device, host functions are never emitted. Similarly,
+  // when compiling for host, device and global functions are never emitted.
+  // (Technically, we do emit a host-side stub for global functions, but this
+  // doesn't count for our purposes here.)
+  Sema::CUDAFunctionTarget T = S.IdentifyCUDATarget(FD);
+  if (S.getLangOpts().CUDAIsDevice && T == Sema::CFT_Host)
+    return false;
+  if (!S.getLangOpts().CUDAIsDevice &&
+      (T == Sema::CFT_Device || T == Sema::CFT_Global))
+    return false;
+
+  // Externally-visible and similar functions are always emitted.
+  if (S.getASTContext().GetGVALinkageForFunction(FD) > GVA_DiscardableODR)
+    return true;
+
+  // Otherwise, the function is known-emitted if it's in our set of
+  // known-emitted functions.
+  return S.CUDAKnownEmittedFns.count(FD) > 0;
+}
+
 Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
                                                  unsigned DiagID) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  CUDADiagBuilder::Kind DiagKind;
-  switch (CurrentCUDATarget()) {
-  case CFT_Global:
-  case CFT_Device:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFT_HostDevice:
-    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Deferred
-                                          : CUDADiagBuilder::K_Nop;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Global:
+    case CFT_Device:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in device
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice) {
+        return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                   ? CUDADiagBuilder::K_Immediate
+                   : CUDADiagBuilder::K_Deferred;
+      }
+      return CUDADiagBuilder::K_Nop;
+
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
   return CUDADiagBuilder(DiagKind, Loc, DiagID,
                          dyn_cast<FunctionDecl>(CurContext), *this);
 }
@@ -527,41 +584,119 @@ Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
 Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
                                                unsigned DiagID) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
-  CUDADiagBuilder::Kind DiagKind;
-  switch (CurrentCUDATarget()) {
-  case CFT_Host:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFT_HostDevice:
-    DiagKind = getLangOpts().CUDAIsDevice ? CUDADiagBuilder::K_Nop
-                                          : CUDADiagBuilder::K_Deferred;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (CurrentCUDATarget()) {
+    case CFT_Host:
+      return CUDADiagBuilder::K_Immediate;
+    case CFT_HostDevice:
+      // An HD function counts as host code if we're compiling for host, and
+      // device code if we're compiling for device. Defer any errors in device
+      // mode until the function is known-emitted.
+      if (getLangOpts().CUDAIsDevice)
+        return CUDADiagBuilder::K_Nop;
+
+      return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
+                 ? CUDADiagBuilder::K_Immediate
+                 : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
   return CUDADiagBuilder(DiagKind, Loc, DiagID,
                          dyn_cast<FunctionDecl>(CurContext), *this);
 }
 
+// Emit any deferred diagnostics for FD and erase them from the map in which
+// they're stored.
+static void EmitDeferredDiags(Sema &S, FunctionDecl *FD) {
+  auto It = S.CUDADeferredDiags.find(FD);
+  if (It == S.CUDADeferredDiags.end())
+    return;
+  for (PartialDiagnosticAt &PDAt : It->second) {
+    const SourceLocation &Loc = PDAt.first;
+    const PartialDiagnostic &PD = PDAt.second;
+    DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
+    Builder.setForceEmit();
+    PD.Emit(Builder);
+  }
+  S.CUDADeferredDiags.erase(It);
+}
+
+// Indicate that this function (and thus everything it transitively calls) will
+// be codegen'ed, and emit any deferred diagnostics on this function and its
+// (transitive) callees.
+static void MarkKnownEmitted(Sema &S, FunctionDecl *FD) {
+  // Nothing to do if we already know that FD is emitted.
+  if (IsKnownEmitted(S, FD)) {
+    assert(!S.CUDACallGraph.count(FD));
+    return;
+  }
+
+  // We've just discovered that FD is known-emitted. Walk our call graph to
+  // see what else we can now discover also must be emitted.
+  llvm::SmallVector<FunctionDecl *, 4> Worklist = {FD};
+  llvm::SmallSet<FunctionDecl *, 4> Seen;
+  Seen.insert(FD);
+  while (!Worklist.empty()) {
+    FunctionDecl *Caller = Worklist.pop_back_val();
+    assert(!IsKnownEmitted(S, Caller) &&
+           "Worklist should not contain known-emitted functions.");
+    S.CUDAKnownEmittedFns.insert(Caller);
+    EmitDeferredDiags(S, Caller);
+
+    // Deferred diags are often emitted on the template itself, so emit those
+    // as well.
+    if (auto *Templ = Caller->getPrimaryTemplate())
+      EmitDeferredDiags(S, Templ->getAsFunction());
+
+    // Add all functions called by Caller to our worklist.
+    auto CGIt = S.CUDACallGraph.find(Caller);
+    if (CGIt == S.CUDACallGraph.end())
+      continue;
+
+    for (FunctionDecl *Callee : CGIt->second) {
+      if (Seen.count(Callee) || IsKnownEmitted(S, Callee))
+        continue;
+      Seen.insert(Callee);
+      Worklist.push_back(Callee);
+    }
+
+    // Caller is now known-emitted, so we no longer need to maintain its list
+    // of callees in CUDACallGraph.
+    S.CUDACallGraph.erase(CGIt);
+  }
+}
+
 bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   assert(Callee && "Callee may not be null.");
+  // FIXME: Is bailing out early correct here? Should we instead assume that
+  // the caller is a global initializer?
   FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
   if (!Caller)
     return true;
 
-  CUDADiagBuilder::Kind DiagKind;
-  switch (IdentifyCUDAPreference(Caller, Callee)) {
-  case CFP_Never:
-    DiagKind = CUDADiagBuilder::K_Immediate;
-    break;
-  case CFP_WrongSide:
-    assert(Caller && "WrongSide calls require a non-null caller");
-    DiagKind = CUDADiagBuilder::K_Deferred;
-    break;
-  default:
-    DiagKind = CUDADiagBuilder::K_Nop;
-  }
+  bool CallerKnownEmitted = IsKnownEmitted(*this, Caller);
+  if (CallerKnownEmitted)
+    MarkKnownEmitted(*this, Callee);
+  else
+    CUDACallGraph[Caller].insert(Callee);
+
+  CUDADiagBuilder::Kind DiagKind = [&] {
+    switch (IdentifyCUDAPreference(Caller, Callee)) {
+    case CFP_Never:
+      return CUDADiagBuilder::K_Immediate;
+    case CFP_WrongSide:
+      assert(Caller && "WrongSide calls require a non-null caller");
+      // If we know the caller will be emitted, we know this wrong-side call
+      // will be emitted, so it's an immediate error. Otherwise, defer the
+      // error until we know the caller is emitted.
+      return CallerKnownEmitted ? CUDADiagBuilder::K_Immediate
+                                : CUDADiagBuilder::K_Deferred;
+    default:
+      return CUDADiagBuilder::K_Nop;
+    }
+  }();
 
   // Avoid emitting this error twice for the same location. Using a hashtable
   // like this is unfortunate, but because we must continue parsing as normal
diff --git a/test/Parser/lambda-attr.cu b/test/Parser/lambda-attr.cu
index dfd6fc8ecd..d5aaf01946 100644
--- a/test/Parser/lambda-attr.cu
+++ b/test/Parser/lambda-attr.cu
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -fcuda-is-device -verify %s
 
 __attribute__((device)) void device_fn() {}
-__attribute__((device)) void hd_fn() {}
+__attribute__((host, device)) void hd_fn() {}
 
 __attribute__((device)) void device_attr() {
   ([]() __attribute__((device)) { device_fn(); })();
diff --git a/test/SemaCUDA/call-host-fn-from-device.cu b/test/SemaCUDA/call-host-fn-from-device.cu
index b83beba58d..bb6ea230fa 100644
--- a/test/SemaCUDA/call-host-fn-from-device.cu
+++ b/test/SemaCUDA/call-host-fn-from-device.cu
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - -verify
+// RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device \
+// RUN:   -emit-llvm -o /dev/null -verify
 
 // Note: This test won't work with -fsyntax-only, because some of these errors
 // are emitted during codegen.
diff --git a/test/SemaCUDA/function-overload.cu b/test/SemaCUDA/function-overload.cu
index 11e8bae126..4545a80875 100644
--- a/test/SemaCUDA/function-overload.cu
+++ b/test/SemaCUDA/function-overload.cu
@@ -170,18 +170,35 @@ __host__ __device__ void hostdevicef() {
   DeviceFnPtr fp_d = d;
   DeviceReturnTy ret_d = d();
   DeviceFnPtr fp_cd = cd;
   DeviceReturnTy ret_cd = cd();
+#if !defined(__CUDA_ARCH__)
+  // expected-error@-5 {{reference to __device__ function 'd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'cd' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __device__ function 'cd' in __host__ __device__ function}}
+#endif
 
   HostFnPtr fp_h = h;
   HostReturnTy ret_h = h();
   HostFnPtr fp_ch = ch;
   HostReturnTy ret_ch = ch();
+#if defined(__CUDA_ARCH__)
+  // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'h' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+  // expected-error@-5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+#endif
 
   CurrentFnPtr fp_dh = dh;
   CurrentReturnTy ret_dh = dh();
   CurrentFnPtr fp_cdh = cdh;
   CurrentReturnTy ret_cdh = cdh();
 
-  g(); // expected-error {{call to global function g not configured}}
+  g();
+#if defined(__CUDA_ARCH__)
+  // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
+#else
+  // expected-error@-4 {{call to global function g not configured}}
+#endif
 }
 
 // Test for address of overloaded function resolution in the global context.
@@ -297,7 +314,11 @@ __device__ void test_device_calls_template_fn() {
 
 // If we have a mix of HD and H-only or D-only candidates in the overload set,
 // normal C++ overload resolution rules apply first.
-template <typename T> TemplateReturnTy template_vs_hd_function(T arg) {
+template <typename T> TemplateReturnTy template_vs_hd_function(T arg)
+#ifdef __CUDA_ARCH__
+//expected-note@-2 {{declared here}}
+#endif
+{
   return TemplateReturnTy();
 }
 __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
@@ -307,6 +328,9 @@ __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
 __host__ __device__ void test_host_device_calls_hd_template() {
   HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
   TemplateReturnTy ret2 = template_vs_hd_function(1);
+#ifdef __CUDA_ARCH__
+  // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
+#endif
 }
 
 __host__ void test_host_calls_hd_template() {
@@ -326,14 +350,30 @@ __device__ void test_device_calls_hd_template() {
 // side of compilation.
 __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
 __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
+#ifndef __CUDA_ARCH__
+  // expected-note@-3 {{'device_only_function' declared here}}
+  // expected-note@-3 {{'device_only_function' declared here}}
+#endif
 __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
 __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
+#ifdef __CUDA_ARCH__
+  // expected-note@-3 {{'host_only_function' declared here}}
+  // expected-note@-3 {{'host_only_function' declared here}}
+#endif
 
 __host__ __device__ void test_host_device_single_side_overloading() {
   DeviceReturnTy ret1 = device_only_function(1);
   DeviceReturnTy2 ret2 = device_only_function(1.0f);
+#ifndef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+#endif
   HostReturnTy ret3 = host_only_function(1);
   HostReturnTy2 ret4 = host_only_function(1.0f);
+#ifdef __CUDA_ARCH__
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+  // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+#endif
 }
 
 // Verify that we allow overloading function templates.
diff --git a/test/SemaCUDA/method-target.cu b/test/SemaCUDA/method-target.cu
index 505664586b..8e17daa0c1 100644
--- a/test/SemaCUDA/method-target.cu
+++ b/test/SemaCUDA/method-target.cu
@@ -29,7 +29,7 @@ __device__ void foo2(S2& s, int i, float f) {
 // Test 3: device method called from host function
 
 struct S3 {
-  __device__ void method() {} // expected-note {{'method' declared here}};
+  __device__ void method() {} // expected-note {{'method' declared here}}
 };
 
 void foo3(S3& s) {
@@ -40,11 +40,11 @@ void foo3(S3& s) {
 // Test 4: device method called from host&device function
 
 struct S4 {
-  __device__ void method() {}
+  __device__ void method() {} // expected-note {{'method' declared here}}
 };
 
 __host__ __device__ void foo4(S4& s) {
-  s.method();
+  s.method(); // expected-error {{reference to __device__ function 'method' in __host__ __device__ function}}
 }
 
 //------------------------------------------------------------------------------
diff --git a/test/SemaCUDA/reference-to-kernel-fn.cu b/test/SemaCUDA/reference-to-kernel-fn.cu
index 2d25dde94f..e502d134b0 100644
--- a/test/SemaCUDA/reference-to-kernel-fn.cu
+++ b/test/SemaCUDA/reference-to-kernel-fn.cu
@@ -18,10 +18,7 @@ typedef void (*fn_ptr_t)();
 __host__ __device__ fn_ptr_t get_ptr_hd() {
   return kernel;
 #ifdef DEVICE
-  // This emits a deferred error on the device, but we don't catch it in this
-  // file because the non-deferred error below precludes this.
-
-  // FIXME-expected-error@-2 {{reference to __global__ function}}
+  // expected-error@-2 {{reference to __global__ function}}
 #endif
 }
 __host__ fn_ptr_t get_ptr_h() {
-- 
2.40.0
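
Note on the algorithm: stripped of Clang specifics, the bookkeeping added in
SemaCUDA.cpp is a worklist flood over a partial call graph. The standalone
sketch below models that idea with plain STL containers; every name in it
(Fn, DeferredDiagModel, onCall, emitDeferred, the diagnostic strings) is
invented for illustration and none of it is Clang API.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Fn = std::string;

struct DeferredDiagModel {
  // Analogues of Sema::CUDADeferredDiags, Sema::CUDAKnownEmittedFns, and
  // Sema::CUDACallGraph, with diagnostics reduced to strings.
  std::map<Fn, std::vector<std::string>> DeferredDiags;
  std::set<Fn> KnownEmitted;
  std::map<Fn, std::set<Fn>> CallGraph;

  // Analogue of EmitDeferredDiags: print and drop F's deferred diags.
  void emitDeferred(const Fn &F) {
    auto It = DeferredDiags.find(F);
    if (It == DeferredDiags.end())
      return;
    for (const std::string &D : It->second)
      std::cout << F << ": error: " << D << "\n";
    DeferredDiags.erase(It);
  }

  // Analogue of MarkKnownEmitted: flood the known-emitted property through
  // the partial call graph, draining deferred diags along the way.
  // (The patch's extra handling of function templates is omitted.)
  void markKnownEmitted(const Fn &F) {
    if (KnownEmitted.count(F))
      return;
    std::vector<Fn> Worklist = {F};
    std::set<Fn> Seen = {F};
    while (!Worklist.empty()) {
      Fn Caller = Worklist.back();
      Worklist.pop_back();
      KnownEmitted.insert(Caller);
      emitDeferred(Caller);
      auto CGIt = CallGraph.find(Caller);
      if (CGIt == CallGraph.end())
        continue;
      for (const Fn &Callee : CGIt->second)
        if (!KnownEmitted.count(Callee) && Seen.insert(Callee).second)
          Worklist.push_back(Callee);
      // Caller is now known-emitted; its outgoing edges are no longer needed.
      CallGraph.erase(CGIt);
    }
  }

  // Analogue of the new bookkeeping at the top of CheckCUDACall.
  void onCall(const Fn &Caller, const Fn &Callee) {
    if (KnownEmitted.count(Caller))
      markKnownEmitted(Callee);
    else
      CallGraph[Caller].insert(Callee);
  }
};

int main() {
  DeferredDiagModel M;
  // An HD function uses a device-illegal construct; the error is deferred.
  M.DeferredDiags["hd1"].push_back("illegal construct in device code");
  // hd2 calls hd1, but neither is known-emitted yet: just record the edge.
  M.onCall("hd2", "hd1");
  // A kernel is known-emitted a priori; its call to hd2 makes hd2 and,
  // transitively, hd1 known-emitted, so the deferred error now fires.
  M.KnownEmitted.insert("kernel");
  M.onCall("kernel", "hd2");
}

The key invariant matches the patch: an edge lives in the call graph only
while its caller is not known-emitted; as soon as the caller becomes
known-emitted, the edge is consumed and the property (plus any pending
diagnostics) propagates to the callees.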
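
For the user-visible effect, here is a hypothetical CUDA fragment (not one of
this patch's test changes; the diagnostic wording in the comments is
paraphrased, not Clang's exact text):

__host__ __device__ void hd_fn(int n) {
  // A device-illegal construct in an HD function. Under this patch the error
  // is deferred: it is recorded against hd_fn and only emitted once hd_fn
  // becomes known-emitted for the device.
  int vla[n]; // deferred error: variable-length array in device code
}

__host__ void host_caller(int n) {
  hd_fn(n); // host-only caller: on a device compile this edge never makes
            // hd_fn known-emitted, so no error is produced.
}

__global__ void kernel(int n) {
  hd_fn(n); // a __global__ kernel is known-emitted a priori, so this call
            // marks hd_fn known-emitted and the deferred error above fires.
}

Before this patch, the deferred error surfaced only during codegen, so it
vanished whenever any other Sema error stopped compilation early; now it is
emitted during Sema as soon as the call graph proves hd_fn must be emitted.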