From bf32d4a7db9373ef41d77bcac0bddca5b663013a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 19 Oct 2016 21:15:01 +0000
Subject: [PATCH] [CUDA] When we emit an error that might have been deferred,
 also print a callstack.

Summary:
Previously, when you did something not allowed in a host+device function
and then caused it to be codegen'ed, we would print out an error telling
you that you did something bad, but we wouldn't tell you how we decided
that the function needed to be codegen'ed.

This change causes us to print out a callstack when emitting deferred
errors.  This is immensely helpful when debugging highly-templated code,
where it's often unclear how a function became known-emitted.  We only
print the callstack once per function, after we print all the deferred
errors.

This patch also switches all of our hashtables to using canonical
FunctionDecls instead of regular FunctionDecls.  This prevents a number
of bugs, some of which are caught by tests added here, in which we
assume that two FDs for the same function have the same pointer value.

Reviewers: rnk

Subscribers: cfe-commits, tra

Differential Revision: https://reviews.llvm.org/D25704

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@284647 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Basic/DiagnosticSemaKinds.td   |   1 +
 include/clang/Sema/Sema.h                    |  81 +++++-----
 lib/Sema/SemaCUDA.cpp                        | 142 ++++++++++++------
 test/SemaCUDA/bad-calls-on-same-line.cu      |   2 +
 test/SemaCUDA/call-device-fn-from-host.cu    |   3 +-
 test/SemaCUDA/call-host-fn-from-device.cu    |   2 +-
 test/SemaCUDA/call-stack-for-deferred-err.cu |  18 +++
 test/SemaCUDA/exceptions.cu                  |   3 +
 .../no-call-stack-for-immediate-errs.cu      |  17 +++
 test/SemaCUDA/trace-through-global.cu        |  10 +-
 10 files changed, 197 insertions(+), 82 deletions(-)
 create mode 100644 test/SemaCUDA/call-stack-for-deferred-err.cu
 create mode 100644 test/SemaCUDA/no-call-stack-for-immediate-errs.cu

diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 407b84f261..ede1d9e3a0 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -6702,6 +6702,7 @@
 def err_deleted_function_use : Error<"attempt to use a deleted function">;
 def err_deleted_inherited_ctor_use : Error<
   "constructor inherited by %0 from base class %1 is implicitly deleted">;
+def note_called_by : Note<"called by %0">;
 def err_kern_type_not_void_return : Error<
   "kernel function type %0 must have void return type">;
 def err_kern_is_nonstatic_method : Error<
diff --git a/include/clang/Sema/Sema.h b/include/clang/Sema/Sema.h
index 627aa5bfb3..78080f5e4a 100644
--- a/include/clang/Sema/Sema.h
+++ b/include/clang/Sema/Sema.h
@@ -9249,26 +9249,42 @@ public:
   /// Diagnostics that are emitted only if we discover that the given function
   /// must be codegen'ed.  Because handling these correctly adds overhead to
   /// compilation, this is currently only enabled for CUDA compilations.
-  llvm::DenseMap<const FunctionDecl *, std::vector<PartialDiagnosticAt>>
+  llvm::DenseMap<CanonicalDeclPtr<FunctionDecl>,
+                 std::vector<PartialDiagnosticAt>>
       CUDADeferredDiags;

   /// FunctionDecls plus raw encodings of SourceLocations for which
   /// CheckCUDACall has emitted a (maybe deferred) "bad call" diagnostic.  We
   /// use this to avoid emitting the same deferred diag twice.
-  llvm::DenseSet<std::pair<FunctionDecl *, unsigned>> LocsWithCUDACallDiags;
+  llvm::DenseSet<std::pair<CanonicalDeclPtr<FunctionDecl>, unsigned>>
+      LocsWithCUDACallDiags;

-  /// The set of CUDA functions that we've discovered must be emitted by tracing
-  /// the call graph.  Functions that we can tell a priori must be emitted
-  /// aren't added to this set.
-  llvm::DenseSet<const FunctionDecl *> CUDAKnownEmittedFns;
+  /// A pair of a canonical FunctionDecl and a SourceLocation.
+  struct FunctionDeclAndLoc {
+    CanonicalDeclPtr<FunctionDecl> FD;
+    SourceLocation Loc;
+  };
+
+  /// An inverse call graph, mapping known-emitted functions to one of their
+  /// known-emitted callers (plus the location of the call).
+  ///
+  /// Functions that we can tell a priori must be emitted aren't added to this
+  /// map.
+  llvm::DenseMap</* Callee = */ CanonicalDeclPtr<FunctionDecl>,
+                 /* Caller = */ FunctionDeclAndLoc>
+      CUDAKnownEmittedFns;

   /// A partial call graph maintained during CUDA compilation to support
-  /// deferred diagnostics.  Specifically, functions are only added here if, at
-  /// the time they're added, they are not known-emitted.  As soon as we
-  /// discover that a function is known-emitted, we remove it and everything it
-  /// transitively calls from this set and add those functions to
-  /// CUDAKnownEmittedFns.
-  llvm::DenseMap<FunctionDecl *, llvm::SetVector<FunctionDecl *>> CUDACallGraph;
+  /// deferred diagnostics.
+  ///
+  /// Functions are only added here if, at the time they're considered, they are
+  /// not known-emitted.  As soon as we discover that a function is
+  /// known-emitted, we remove it and everything it transitively calls from this
+  /// set and add those functions to CUDAKnownEmittedFns.
+  llvm::DenseMap</* Caller = */ CanonicalDeclPtr<FunctionDecl>,
+                 /* Callees = */ llvm::MapVector<CanonicalDeclPtr<FunctionDecl>,
+                                                 SourceLocation>>
+      CUDACallGraph;

   /// Diagnostic builder for CUDA errors which may or may not be deferred.
   ///
@@ -9291,13 +9307,19 @@ public:
       /// Emit no diagnostic.
       K_Nop,
       /// Emit the diagnostic immediately (i.e., behave like Sema::Diag()).
       K_Immediate,
+      /// Emit the diagnostic immediately, and, if it's a warning or error, also
+      /// emit a call stack showing how this function can be reached by an a
+      /// priori known-emitted function.
+      K_ImmediateWithCallStack,
       /// Create a deferred diagnostic, which is emitted only if the function
-      /// it's attached to is codegen'ed.
+      /// it's attached to is codegen'ed.  Also emit a call stack as with
+      /// K_ImmediateWithCallStack.
       K_Deferred
    };

    CUDADiagBuilder(Kind K, SourceLocation Loc, unsigned DiagID,
                    FunctionDecl *Fn, Sema &S);
+   ~CUDADiagBuilder();

    /// Convertible to bool: True if we immediately emitted an error, false if
    /// we didn't emit an error or we created a deferred error.
@@ -9309,38 +9331,29 @@ public:
    ///
    /// But see CUDADiagIfDeviceCode() and CUDADiagIfHostCode() -- you probably
    /// want to use these instead of creating a CUDADiagBuilder yourself.
-   operator bool() const { return ImmediateDiagBuilder.hasValue(); }
+   operator bool() const { return ImmediateDiag.hasValue(); }

    template <typename T>
    friend const CUDADiagBuilder &operator<<(const CUDADiagBuilder &Diag,
                                             const T &Value) {
-     if (Diag.ImmediateDiagBuilder.hasValue())
-       *Diag.ImmediateDiagBuilder << Value;
-     else if (Diag.PartialDiagInfo.hasValue())
-       Diag.PartialDiagInfo->PD << Value;
+     if (Diag.ImmediateDiag.hasValue())
+       *Diag.ImmediateDiag << Value;
+     else if (Diag.PartialDiag.hasValue())
+       *Diag.PartialDiag << Value;
      return Diag;
    }

  private:
-   struct PartialDiagnosticInfo {
-     PartialDiagnosticInfo(Sema &S, SourceLocation Loc, PartialDiagnostic PD,
-                           FunctionDecl *Fn)
-         : S(S), Loc(Loc), PD(std::move(PD)), Fn(Fn) {}
-
-     ~PartialDiagnosticInfo() {
-       S.CUDADeferredDiags[Fn].push_back({Loc, std::move(PD)});
-     }
-
-     Sema &S;
-     SourceLocation Loc;
-     PartialDiagnostic PD;
-     FunctionDecl *Fn;
-   };
+   Sema &S;
+   SourceLocation Loc;
+   unsigned DiagID;
+   FunctionDecl *Fn;
+   bool ShowCallStack;

    // Invariant: At most one of these Optionals has a value.
    // FIXME: Switch these to a Variant once that exists.
-   llvm::Optional<DiagnosticBuilder> ImmediateDiagBuilder;
-   llvm::Optional<PartialDiagnosticInfo> PartialDiagInfo;
+   llvm::Optional<DiagnosticBuilder> ImmediateDiag;
+   llvm::Optional<PartialDiagnostic> PartialDiag;
  };

  /// Creates a CUDADiagBuilder that emits the diagnostic if the current context
diff --git a/lib/Sema/SemaCUDA.cpp b/lib/Sema/SemaCUDA.cpp
index 423aef370b..7e05cc86ba 100644
--- a/lib/Sema/SemaCUDA.cpp
+++ b/lib/Sema/SemaCUDA.cpp
@@ -488,22 +488,6 @@ void Sema::maybeAddCUDAHostDeviceAttrs(Scope *S, FunctionDecl *NewD,
   NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
 }

-Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
-                                       unsigned DiagID, FunctionDecl *Fn,
-                                       Sema &S) {
-  switch (K) {
-  case K_Nop:
-    break;
-  case K_Immediate:
-    ImmediateDiagBuilder.emplace(S.Diag(Loc, DiagID));
-    break;
-  case K_Deferred:
-    assert(Fn && "Must have a function to attach the deferred diag to.");
-    PartialDiagInfo.emplace(S, Loc, S.PDiag(DiagID), Fn);
-    break;
-  }
-}
-
 // In CUDA, there are some constructs which may appear in semantically-valid
 // code, but trigger errors if we ever generate code for the function in which
 // they appear.  Essentially every construct you're not allowed to use on the
@@ -528,6 +512,54 @@
 // until we discover that the function is known-emitted, at which point we take
 // it out of this map and emit the diagnostic.

+Sema::CUDADiagBuilder::CUDADiagBuilder(Kind K, SourceLocation Loc,
+                                       unsigned DiagID, FunctionDecl *Fn,
+                                       Sema &S)
+    : S(S), Loc(Loc), DiagID(DiagID), Fn(Fn),
+      ShowCallStack(K == K_ImmediateWithCallStack || K == K_Deferred) {
+  switch (K) {
+  case K_Nop:
+    break;
+  case K_Immediate:
+  case K_ImmediateWithCallStack:
+    ImmediateDiag.emplace(S.Diag(Loc, DiagID));
+    break;
+  case K_Deferred:
+    assert(Fn && "Must have a function to attach the deferred diag to.");
+    PartialDiag.emplace(S.PDiag(DiagID));
+    break;
+  }
+}
+
+// Print notes showing how we can reach FD starting from an a priori
+// known-callable function.
+static void EmitCallStackNotes(Sema &S, FunctionDecl *FD) {
+  auto FnIt = S.CUDAKnownEmittedFns.find(FD);
+  while (FnIt != S.CUDAKnownEmittedFns.end()) {
+    DiagnosticBuilder Builder(
+        S.Diags.Report(FnIt->second.Loc, diag::note_called_by));
+    Builder << FnIt->second.FD;
+    Builder.setForceEmit();
+
+    FnIt = S.CUDAKnownEmittedFns.find(FnIt->second.FD);
+  }
+}
+
+Sema::CUDADiagBuilder::~CUDADiagBuilder() {
+  if (ImmediateDiag) {
+    // Emit our diagnostic and, if it was a warning or error, output a callstack
+    // if Fn isn't a priori known-emitted.
+    bool IsWarningOrError = S.getDiagnostics().getDiagnosticLevel(
+                                DiagID, Loc) >= DiagnosticsEngine::Warning;
+    ImmediateDiag.reset(); // Emit the immediate diag.
+    if (IsWarningOrError && ShowCallStack)
+      EmitCallStackNotes(S, Fn);
+  } else if (PartialDiag) {
+    assert(ShowCallStack && "Must always show call stack for deferred diags.");
+    S.CUDADeferredDiags[Fn].push_back({Loc, std::move(*PartialDiag)});
+  }
+}
+
 // Do we know that we will eventually codegen the given function?
 static bool IsKnownEmitted(Sema &S, FunctionDecl *FD) {
   // Templates are emitted when they're instantiated.
@@ -568,7 +600,7 @@ Sema::CUDADiagBuilder Sema::CUDADiagIfDeviceCode(SourceLocation Loc,
   // mode until the function is known-emitted.
   if (getLangOpts().CUDAIsDevice) {
     return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
-               ? CUDADiagBuilder::K_Immediate
+               ? CUDADiagBuilder::K_ImmediateWithCallStack
                : CUDADiagBuilder::K_Deferred;
   }
   return CUDADiagBuilder::K_Nop;
@@ -596,7 +628,7 @@ Sema::CUDADiagBuilder Sema::CUDADiagIfHostCode(SourceLocation Loc,
       return CUDADiagBuilder::K_Nop;

     return IsKnownEmitted(*this, dyn_cast<FunctionDecl>(CurContext))
-               ? CUDADiagBuilder::K_Immediate
+               ? CUDADiagBuilder::K_ImmediateWithCallStack
                : CUDADiagBuilder::K_Deferred;
   default:
     return CUDADiagBuilder::K_Nop;
@@ -612,63 +644,84 @@ static void EmitDeferredDiags(Sema &S, FunctionDecl *FD) {
   auto It = S.CUDADeferredDiags.find(FD);
   if (It == S.CUDADeferredDiags.end())
     return;
+  bool HasWarningOrError = false;
   for (PartialDiagnosticAt &PDAt : It->second) {
     const SourceLocation &Loc = PDAt.first;
     const PartialDiagnostic &PD = PDAt.second;
+    HasWarningOrError |= S.getDiagnostics().getDiagnosticLevel(
+                             PD.getDiagID(), Loc) >= DiagnosticsEngine::Warning;
     DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID()));
     Builder.setForceEmit();
     PD.Emit(Builder);
   }
   S.CUDADeferredDiags.erase(It);
+
+  // FIXME: Should this be called after every warning/error emitted in the loop
+  // above, instead of just once per function?  That would be consistent with
+  // how we handle immediate errors, but it also seems like a bit much.
+  if (HasWarningOrError)
+    EmitCallStackNotes(S, FD);
 }

 // Indicate that this function (and thus everything it transtively calls) will
 // be codegen'ed, and emit any deferred diagnostics on this function and its
 // (transitive) callees.
-static void MarkKnownEmitted(Sema &S, FunctionDecl *FD) {
+static void MarkKnownEmitted(Sema &S, FunctionDecl *OrigCaller,
+                             FunctionDecl *OrigCallee, SourceLocation OrigLoc) {
   // Nothing to do if we already know that FD is emitted.
-  if (IsKnownEmitted(S, FD)) {
-    assert(!S.CUDACallGraph.count(FD));
+  if (IsKnownEmitted(S, OrigCallee)) {
+    assert(!S.CUDACallGraph.count(OrigCallee));
     return;
   }

-  // We've just discovered that FD is known-emitted.  Walk our call graph to see
-  // what else we can now discover also must be emitted.
-  llvm::SmallVector<FunctionDecl *, 4> Worklist = {FD};
-  llvm::SmallSet<FunctionDecl *, 4> Seen;
-  Seen.insert(FD);
+  // We've just discovered that OrigCallee is known-emitted.  Walk our call
+  // graph to see what else we can now discover also must be emitted.
+
+  struct CallInfo {
+    FunctionDecl *Caller;
+    FunctionDecl *Callee;
+    SourceLocation Loc;
+  };
+  llvm::SmallVector<CallInfo, 4> Worklist = {{OrigCaller, OrigCallee, OrigLoc}};
+  llvm::SmallSet<CanonicalDeclPtr<FunctionDecl>, 4> Seen;
+  Seen.insert(OrigCallee);
   while (!Worklist.empty()) {
-    FunctionDecl *Caller = Worklist.pop_back_val();
-    assert(!IsKnownEmitted(S, Caller) &&
+    CallInfo C = Worklist.pop_back_val();
+    assert(!IsKnownEmitted(S, C.Callee) &&
            "Worklist should not contain known-emitted functions.");
-    S.CUDAKnownEmittedFns.insert(Caller);
-    EmitDeferredDiags(S, Caller);
+    S.CUDAKnownEmittedFns[C.Callee] = {C.Caller, C.Loc};
+    EmitDeferredDiags(S, C.Callee);

     // If this is a template instantiation, explore its callgraph as well:
     // Non-dependent calls are part of the template's callgraph, while dependent
     // calls are part of to the instantiation's call graph.
-    if (auto *Templ = Caller->getPrimaryTemplate()) {
+    if (auto *Templ = C.Callee->getPrimaryTemplate()) {
       FunctionDecl *TemplFD = Templ->getAsFunction();
       if (!Seen.count(TemplFD) && !S.CUDAKnownEmittedFns.count(TemplFD)) {
         Seen.insert(TemplFD);
-        Worklist.push_back(TemplFD);
+        Worklist.push_back(
+            {/* Caller = */ C.Caller, /* Callee = */ TemplFD, C.Loc});
       }
     }

-    // Add all functions called by Caller to our worklist.
-    auto CGIt = S.CUDACallGraph.find(Caller);
+    // Add all functions called by Callee to our worklist.
+    auto CGIt = S.CUDACallGraph.find(C.Callee);
     if (CGIt == S.CUDACallGraph.end())
       continue;

-    for (FunctionDecl *Callee : CGIt->second) {
-      if (Seen.count(Callee) || IsKnownEmitted(S, Callee))
+    for (std::pair<CanonicalDeclPtr<FunctionDecl>, SourceLocation> FDLoc :
+         CGIt->second) {
+      FunctionDecl *NewCallee = FDLoc.first;
+      SourceLocation CallLoc = FDLoc.second;
+      if (Seen.count(NewCallee) || IsKnownEmitted(S, NewCallee))
         continue;
-      Seen.insert(Callee);
-      Worklist.push_back(Callee);
+      Seen.insert(NewCallee);
+      Worklist.push_back(
+          {/* Caller = */ C.Callee, /* Callee = */ NewCallee, CallLoc});
     }

-    // Caller is now known-emitted, so we no longer need to maintain its list of
-    // callees in CUDACallGraph.
+    // C.Callee is now known-emitted, so we no longer need to maintain its list
+    // of callees in CUDACallGraph.
     S.CUDACallGraph.erase(CGIt);
   }
 }
@@ -686,7 +739,7 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   // Otherwise, mark the call in our call graph so we can traverse it later.
   bool CallerKnownEmitted = IsKnownEmitted(*this, Caller);
   if (CallerKnownEmitted)
-    MarkKnownEmitted(*this, Callee);
+    MarkKnownEmitted(*this, Caller, Callee, Loc);
   else {
     // If we have
     //   host fn calls kernel fn calls host+device,
@@ -695,7 +748,7 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
     // that, when compiling for host, only HD functions actually called from the
     // host get marked as known-emitted.
     if (getLangOpts().CUDAIsDevice || IdentifyCUDATarget(Callee) != CFT_Global)
-      CUDACallGraph[Caller].insert(Callee);
+      CUDACallGraph[Caller].insert({Callee, Loc});
   }

   CUDADiagBuilder::Kind DiagKind = [&] {
@@ -707,7 +760,7 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
       // If we know the caller will be emitted, we know this wrong-side call
       // will be emitted, so it's an immediate error.  Otherwise, defer the
       // error until we know the caller is emitted.
-      return CallerKnownEmitted ? CUDADiagBuilder::K_Immediate
+      return CallerKnownEmitted ? CUDADiagBuilder::K_ImmediateWithCallStack
                                : CUDADiagBuilder::K_Deferred;
     default:
       return CUDADiagBuilder::K_Nop;
@@ -729,7 +782,8 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   CUDADiagBuilder(DiagKind, Callee->getLocation(), diag::note_previous_decl,
                   Caller, *this)
       << Callee;
-  return DiagKind != CUDADiagBuilder::K_Immediate;
+  return DiagKind != CUDADiagBuilder::K_Immediate &&
+         DiagKind != CUDADiagBuilder::K_ImmediateWithCallStack;
 }

 void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) {
diff --git a/test/SemaCUDA/bad-calls-on-same-line.cu b/test/SemaCUDA/bad-calls-on-same-line.cu
index e91baff5d2..53d5e12823 100644
--- a/test/SemaCUDA/bad-calls-on-same-line.cu
+++ b/test/SemaCUDA/bad-calls-on-same-line.cu
@@ -35,5 +35,7 @@ inline __host__ __device__ void hd() {
 void host_fn() {
   hd();
   hd();  // expected-note {{function template specialization 'hd'}}
+  // expected-note@-1 {{called by 'host_fn'}}
   hd();  // expected-note {{function template specialization 'hd'}}
+  // expected-note@-1 {{called by 'host_fn'}}
 }
diff --git a/test/SemaCUDA/call-device-fn-from-host.cu b/test/SemaCUDA/call-device-fn-from-host.cu
index ab88338b80..dc22722610 100644
--- a/test/SemaCUDA/call-device-fn-from-host.cu
+++ b/test/SemaCUDA/call-device-fn-from-host.cu
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - -verify
+// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \
+// RUN:   -verify -verify-ignore-unexpected=note

 // Note: This test won't work with -fsyntax-only, because some of these errors
 // are emitted during codegen.
diff --git a/test/SemaCUDA/call-host-fn-from-device.cu b/test/SemaCUDA/call-host-fn-from-device.cu
index bb6ea230fa..d484af1417 100644
--- a/test/SemaCUDA/call-host-fn-from-device.cu
+++ b/test/SemaCUDA/call-host-fn-from-device.cu
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s --std=c++11 -triple nvptx-unknown-unknown -fcuda-is-device \
-// RUN:   -emit-llvm -o /dev/null -verify
+// RUN:   -emit-llvm -o /dev/null -verify -verify-ignore-unexpected=note

 // Note: This test won't work with -fsyntax-only, because some of these errors
 // are emitted during codegen.
diff --git a/test/SemaCUDA/call-stack-for-deferred-err.cu b/test/SemaCUDA/call-stack-for-deferred-err.cu
new file mode 100644
index 0000000000..ddcaabf4ef
--- /dev/null
+++ b/test/SemaCUDA/call-stack-for-deferred-err.cu
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+// We should emit an error for hd_fn's use of a VLA.  This would have been
+// legal if hd_fn were never codegen'ed on the device, so we should also print
+// out a callstack showing how we determine that hd_fn is known-emitted.
+//
+// Compare to no-call-stack-for-immediate-errs.cu.
+
+inline __host__ __device__ void hd_fn(int n);
+inline __device__ void device_fn2() { hd_fn(42); } // expected-note {{called by 'device_fn2'}}
+
+__global__ void kernel() { device_fn2(); } // expected-note {{called by 'kernel'}}
+
+inline __host__ __device__ void hd_fn(int n) {
+  int vla[n]; // expected-error {{variable-length array}}
+}
diff --git a/test/SemaCUDA/exceptions.cu b/test/SemaCUDA/exceptions.cu
index 73d2b9d084..49568ecac7 100644
--- a/test/SemaCUDA/exceptions.cu
+++ b/test/SemaCUDA/exceptions.cu
@@ -50,3 +50,6 @@ inline __host__ __device__ void hd3() {
 }

 __device__ void call_hd3() { hd3(); }
+#ifdef __CUDA_ARCH__
+// expected-note@-2 {{called by 'call_hd3'}}
+#endif
diff --git a/test/SemaCUDA/no-call-stack-for-immediate-errs.cu b/test/SemaCUDA/no-call-stack-for-immediate-errs.cu
new file mode 100644
index 0000000000..6dc98695c1
--- /dev/null
+++ b/test/SemaCUDA/no-call-stack-for-immediate-errs.cu
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+// Here we should dump an error about the VLA in device_fn, but we should not
+// print a callstack indicating how device_fn becomes known-emitted, because
+// it's an error to use a VLA in any __device__ function, even one that doesn't
+// get emitted.
+
+inline __device__ void device_fn(int n);
+inline __device__ void device_fn2() { device_fn(42); }
+
+__global__ void kernel() { device_fn2(); }
+
+inline __device__ void device_fn(int n) {
+  int vla[n]; // expected-error {{variable-length array}}
+}
diff --git a/test/SemaCUDA/trace-through-global.cu b/test/SemaCUDA/trace-through-global.cu
index 7a9b8dc72b..065342fdd1 100644
--- a/test/SemaCUDA/trace-through-global.cu
+++ b/test/SemaCUDA/trace-through-global.cu
@@ -35,10 +35,16 @@ __global__ void kernel(int) { hd2(); }
 template <typename T>
 void launch_kernel() {
   kernel<<<0, 0>>>(T());
-  hd1();
-  hd3(T());
+
+  // Notice that these two diagnostics are different: Because the call to hd1
+  // is not dependent on T, the call to hd1 comes from 'launch_kernel', while
+  // the call to hd3, being dependent, comes from 'launch_kernel<int>'.
+  hd1(); // expected-note {{called by 'launch_kernel'}}
+  hd3(T()); // expected-note {{called by 'launch_kernel<int>'}}
 }

 void host_fn() {
   launch_kernel<int>();
+  // expected-note@-1 {{called by 'host_fn'}}
+  // expected-note@-2 {{called by 'host_fn'}}
 }
--
2.40.0
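
Illustration (not part of the patch): the hypothetical translation unit below
sketches the user-visible effect of the change, mirroring what the new
call-stack-for-deferred-err.cu test checks.  The file name, the function
names, and the exact error wording quoted in the final comment are invented;
only the "called by" notes correspond to the note_called_by diagnostic added
above, and the compile command simply mirrors the RUN lines of the new tests.

// illustrative.cu -- hypothetical example.  Compile for the device the same
// way the new tests do, e.g.:
//
//   clang -cc1 -fcuda-is-device -fsyntax-only illustrative.cu
//
// Stand-ins for the CUDA attribute macros; the in-tree tests get these from
// Inputs/cuda.h instead.
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

// Legal on its own: the VLA is only an error if hd() is actually codegen'ed
// for the device, so Sema records a deferred diagnostic here instead of
// erroring out immediately.
inline __host__ __device__ void hd(int n) {
  int vla[n];
}

// middle() is not yet known-emitted when this call is parsed, so the call is
// merely recorded as an edge in CUDACallGraph.
inline __device__ void middle(int n) { hd(n); }

// A kernel is known-emitted a priori.  Parsing this call triggers
// MarkKnownEmitted(), which moves middle() and hd() into CUDAKnownEmittedFns
// and flushes hd()'s deferred VLA error, now followed by the call stack:
//
//   error: ... variable-length array ...   (wording approximate)
//   note: called by 'middle'
//   note: called by 'kernel'
__global__ void kernel(int n) { middle(n); }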