From c452e7c5b13885446155f8ed4371a2656b545ac8 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 7 May 2018 14:50:05 +0000
Subject: [PATCH] [OPENMP, NVPTX] Added support for L2 parallelism.

Added initial codegen for level 2, 3, etc. parallelism. Currently, all
parallel regions at the second and deeper levels are run sequentially.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@331642 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGOpenMPRuntime.cpp               |   7 -
 lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp          | 443 ++++++++++++++----
 lib/CodeGen/CGOpenMPRuntimeNVPTX.h            |  48 +-
 lib/CodeGen/CodeGenModule.cpp                 |  13 +-
 test/OpenMP/nvptx_target_codegen.cpp          |  88 +++-
 ...bute_parallel_for_generic_mode_codegen.cpp |  55 +--
 6 files changed, 459 insertions(+), 195 deletions(-)

diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 18b51ea0bc..9e14738dd8 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2764,13 +2764,6 @@ Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
                          CGM.getPointerAlign());
 }
 
-/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
-/// function. Here is the logic:
-/// if (Cond) {
-///   ThenGen();
-/// } else {
-///   ElseGen();
-/// }
 void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
                                       const RegionCodeGenTy &ThenGen,
                                       const RegionCodeGenTy &ElseGen) {
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 82be31f0f8..9e6f2b4b9a 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -93,6 +93,9 @@ enum OpenMPRTLFunctionNVPTX {
   OMPRTL_NVPTX__kmpc_end_sharing_variables,
   /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
   OMPRTL_NVPTX__kmpc_get_shared_variables,
+  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_parallel_level,
 };
 
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -131,19 +134,17 @@ public:
   }
 };
 
-// A class to track the execution mode when codegening directives within
-// a target region. The appropriate mode (generic/spmd) is set on entry
-// to the target region and used by containing directives such as 'parallel'
-// to emit optimized code.
+/// A class to track the execution mode when codegening directives within
+/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
+/// to the target region and used by containing directives such as 'parallel'
+/// to emit optimized code.
 class ExecutionModeRAII {
 private:
-  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
-  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+  bool SavedMode;
+  bool &Mode;
 
 public:
-  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
-                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
-      : Mode(Mode) {
+  ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
     SavedMode = Mode;
     Mode = NewMode;
   }
@@ -579,24 +580,171 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
 }
 
 bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
-  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+  return IsInSPMDExecutionMode;
+}
+
+static CGOpenMPRuntimeNVPTX::DataSharingMode
+getDataSharingMode(CodeGenModule &CGM) {
+  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
+                                          : CGOpenMPRuntimeNVPTX::Generic;
 }
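For orientation, hasNestedSPMDDirective below inspects the single directive
nested inside a target region to decide whether the whole kernel can run in
SPMD mode; parallel regions nested any deeper are what the rest of this patch
serializes. A hypothetical user program (illustration only, not taken from
this patch) with both levels:

    #include <cstdio>

    int main() {
      int a[128] = {0};
    #pragma omp target map(tofrom : a)
    #pragma omp parallel for          // level 1: nested parallel directive,
                                      // so the kernel qualifies for SPMD mode
      for (int i = 0; i < 128; ++i) {
    #pragma omp parallel              // level 2: currently run sequentially
        a[i] = 2;
      }
      printf("a[0] = %d\n", a[0]);    // prints 2
      return 0;
    }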
 
-static CGOpenMPRuntimeNVPTX::ExecutionMode
-getExecutionMode(CodeGenModule &CGM) {
-  return CGM.getLangOpts().OpenMPCUDAMode
-             ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
-             : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+/// Check for inner (nested) SPMD construct, if any
+static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
+  const auto *CS = D.getCapturedStmt(OMPD_target);
+  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+  const Stmt *ChildStmt = nullptr;
+  if (const auto *C = dyn_cast<CompoundStmt>(Body))
+    if (C->size() == 1)
+      ChildStmt = C->body_front();
+  if (!ChildStmt)
+    return false;
+
+  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
+    // TODO: add further analysis for inner teams|distribute directives, if any.
+    switch (D.getDirectiveKind()) {
+    case OMPD_target:
+      return (isOpenMPParallelDirective(DKind) &&
+              !isOpenMPTeamsDirective(DKind) &&
+              !isOpenMPDistributeDirective(DKind)) ||
+             isOpenMPSimdDirective(DKind) ||
+             DKind == OMPD_teams_distribute_parallel_for;
+    case OMPD_target_teams:
+      return (isOpenMPParallelDirective(DKind) &&
+              !isOpenMPDistributeDirective(DKind)) ||
+             isOpenMPSimdDirective(DKind) ||
+             DKind == OMPD_distribute_parallel_for;
+    case OMPD_target_teams_distribute:
+      return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+    case OMPD_target_simd:
+    case OMPD_target_parallel:
+    case OMPD_target_parallel_for:
+    case OMPD_target_parallel_for_simd:
+    case OMPD_target_teams_distribute_simd:
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
+    case OMPD_parallel:
+    case OMPD_for:
+    case OMPD_parallel_for:
+    case OMPD_parallel_sections:
+    case OMPD_for_simd:
+    case OMPD_parallel_for_simd:
+    case OMPD_cancel:
+    case OMPD_cancellation_point:
+    case OMPD_ordered:
+    case OMPD_threadprivate:
+    case OMPD_task:
+    case OMPD_simd:
+    case OMPD_sections:
+    case OMPD_section:
+    case OMPD_single:
+    case OMPD_master:
+    case OMPD_critical:
+    case OMPD_taskyield:
+    case OMPD_barrier:
+    case OMPD_taskwait:
+    case OMPD_taskgroup:
+    case OMPD_atomic:
+    case OMPD_flush:
+    case OMPD_teams:
+    case OMPD_target_data:
+    case OMPD_target_exit_data:
+    case OMPD_target_enter_data:
+    case OMPD_distribute:
+    case OMPD_distribute_simd:
+    case OMPD_distribute_parallel_for:
+    case OMPD_distribute_parallel_for_simd:
+    case OMPD_teams_distribute:
+    case OMPD_teams_distribute_simd:
+    case OMPD_teams_distribute_parallel_for:
+    case OMPD_teams_distribute_parallel_for_simd:
+    case OMPD_target_update:
+    case OMPD_declare_simd:
+    case OMPD_declare_target:
+    case OMPD_end_declare_target:
+    case OMPD_declare_reduction:
+    case OMPD_taskloop:
+    case OMPD_taskloop_simd:
+    case OMPD_unknown:
+      llvm_unreachable("Unexpected directive.");
+    }
+  }
+
+  return false;
+}
+
+static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target:
+  case OMPD_target_teams:
+  case OMPD_target_teams_distribute:
+    return hasNestedSPMDDirective(D);
+  case OMPD_target_simd:
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
+  case OMPD_target_parallel_for_simd:
+  case OMPD_target_teams_distribute_simd:
+  case OMPD_target_teams_distribute_parallel_for:
+  case OMPD_target_teams_distribute_parallel_for_simd:
+    return true;
+  case OMPD_parallel:
+  case OMPD_for:
+  case OMPD_parallel_for:
+  case OMPD_parallel_sections:
+  case OMPD_for_simd:
+  case OMPD_parallel_for_simd:
+  case OMPD_cancel:
+  case OMPD_cancellation_point:
+  case OMPD_ordered:
+  case OMPD_threadprivate:
+  case OMPD_task:
+  case OMPD_simd:
+  case OMPD_sections:
+  case OMPD_section:
+  case OMPD_single:
+  case OMPD_master:
+  case OMPD_critical:
+  case OMPD_taskyield:
+  case OMPD_barrier:
+  case OMPD_taskwait:
+  case OMPD_taskgroup:
+  case OMPD_atomic:
+  case OMPD_flush:
+  case OMPD_teams:
+  case OMPD_target_data:
+  case OMPD_target_exit_data:
+  case OMPD_target_enter_data:
+  case OMPD_distribute:
+  case OMPD_distribute_simd:
+  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_teams_distribute:
+  case OMPD_teams_distribute_simd:
+  case OMPD_teams_distribute_parallel_for:
+  case OMPD_teams_distribute_parallel_for_simd:
+  case OMPD_target_update:
+  case OMPD_declare_simd:
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
+  case OMPD_declare_reduction:
+  case OMPD_taskloop:
+  case OMPD_taskloop_simd:
+  case OMPD_unknown:
+    break;
+  }
+  llvm_unreachable(
+      "Unknown programming model for OpenMP directive on NVPTX target.");
+}
 
-void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
-                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
+  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM, D.getLocStart());
   Work.clear();
@@ -613,11 +761,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
         : EST(EST), WST(WST) {}
     void Enter(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
-          .emitGenericEntryHeader(CGF, EST, WST);
+          .emitNonSPMDEntryHeader(CGF, EST, WST);
     }
     void Exit(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
-          .emitGenericEntryFooter(CGF, EST);
+          .emitNonSPMDEntryFooter(CGF, EST);
     }
   } Action(EST, WST);
   CodeGen.setAction(Action);
@@ -633,7 +781,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
 }
 
 // Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST,
                                                   WorkerFunctionState &WST) {
   CGBuilderTy &Bld = CGF.Builder;
@@ -657,6 +805,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
 
   CGF.EmitBlock(MasterBB);
+  IsInTargetMasterThreadRegion = true;
   // SEQUENTIAL (MASTER) REGION START
   // First action in sequential region:
   // Initialize the state of the OpenMP runtime library on the GPU.
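The hunks that follow thread a parallelism level through the generated code:
the worker wrapper now takes the level as its first argument, and the new
__kmpc_parallel_level call queries it at run time. The user-visible
counterpart of this value is omp_get_level(), which counts enclosing parallel
regions whether or not they are serialized; a small host-only illustration
(an assumed example, not from the patch):

    #include <cstdio>
    #include <omp.h>

    int main() {
      printf("outside: %d\n", omp_get_level());     // 0
    #pragma omp parallel num_threads(2)
    #pragma omp single
      {
        printf("level 1: %d\n", omp_get_level());   // 1
    #pragma omp parallel num_threads(2)
    #pragma omp single
        printf("level 2: %d\n", omp_get_level());   // 2, even when the inner
      }                                             // region is serialized
      return 0;
    }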
@@ -674,12 +823,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, emitGenericVarsProlog(CGF, WST.Loc); } -void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, +void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { - emitGenericVarsEpilog(CGF); + IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -707,8 +858,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { - ExecutionModeRAII ModeRAII(CurrentExecutionMode, - CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd); + ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true); EntryFunctionState EST; // Emit target region as a standalone region. @@ -754,10 +904,17 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader( CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); + + emitGenericVarsProlog(CGF, D.getLocStart()); } void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { + if (!CGF.HaveInsertPoint()) + return; + + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -781,11 +938,12 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, // 'generic', the runtime reserves one warp for the master, otherwise, all // warps participate in parallel work. static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, - CGOpenMPRuntimeNVPTX::ExecutionMode Mode) { - auto *GVMode = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, Mode), Twine(Name, "_exec_mode")); + bool Mode) { + auto *GVMode = + new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, + llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), + Twine(Name, "_exec_mode")); CGM.addCompilerUsedGlobal(GVMode); } @@ -846,8 +1004,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); // On termination condition (workid == 0), exit loop. - llvm::Value *ShouldTerminate = - Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); + llvm::Value *WorkID = Bld.CreateLoad(WorkFn); + llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. @@ -886,6 +1044,22 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(CheckNextBB); } + // Default case: call to outlined function through pointer if the target + // region makes a declare target call that may contain an orphaned parallel + // directive. + auto *ParallelFnTy = + llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, + /*isVarArg=*/false) + ->getPointerTo(); + llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy); + // Insert call to work function via shared wrapper. The shared + // wrapper takes two arguments: + // - the parallelism level; + // - the thread ID; + emitCall(CGF, WST.Loc, WorkFnCast, + {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); + // Go to end of parallel region. + CGF.EmitBranch(TerminateBB); // Signal end of parallel region. 
CGF.EmitBlock(TerminateBB); @@ -1163,6 +1337,14 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); break; } + case OMPRTL_NVPTX__kmpc_parallel_level: { + // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level"); + break; + } } return RTLFn; } @@ -1198,27 +1380,19 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM); - switch (Mode) { - case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic: - emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd: + bool Mode = supportsSPMDExecutionMode(D); + if (Mode) emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown: - llvm_unreachable( - "Unknown programming model for OpenMP directive on NVPTX target."); - } + else + emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM, "_", "$"), - CurrentExecutionMode(ExecutionMode::Unknown) { + : CGOpenMPRuntime(CGM, "_", "$") { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); } @@ -1258,23 +1432,32 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( // Emit target region as a standalone region. 
   class NVPTXPrePostActionTy : public PrePostActionTy {
     SourceLocation &Loc;
+    bool &IsInParallelRegion;
+    bool PrevIsInParallelRegion;
 
   public:
-    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+    NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion)
+        : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {}
     void Enter(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
           .emitGenericVarsProlog(CGF, Loc);
+      PrevIsInParallelRegion = IsInParallelRegion;
+      IsInParallelRegion = true;
     }
     void Exit(CodeGenFunction &CGF) override {
+      IsInParallelRegion = PrevIsInParallelRegion;
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
           .emitGenericVarsEpilog(CGF);
     }
-  } Action(Loc);
+  } Action(Loc, IsInParallelRegion);
   CodeGen.setAction(Action);
+  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
+  IsInTargetMasterThreadRegion = false;
   auto *OutlinedFun =
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
-  if (!isInSpmdExecutionMode()) {
+  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+  if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
     llvm::Function *WrapperFun =
         createParallelDataSharingWrapper(OutlinedFun, D);
     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@@ -1316,6 +1499,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
 
 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                  SourceLocation Loc) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   CGBuilderTy &Bld = CGF.Builder;
 
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1402,6 +1588,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I != FunctionGlobalizedDecls.end()) {
     I->getSecond().MappedParams->restore(CGF);
@@ -1449,31 +1638,61 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
   if (isInSpmdExecutionMode())
     emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
   else
-    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
-void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
-  llvm::Function *WFn = WrapperFunctionsMap[Fn];
-
-  assert(WFn && "Wrapper function does not exist!");
 
   // Force inline this outlined function at its call site.
   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
 
-  auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
-                                                   PrePostActionTy &) {
-    CGBuilderTy &Bld = CGF.Builder;
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /*DestWidth=*/32, /*Signed=*/1),
+                                       ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr](
+                       CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+
+    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
+  };
+  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
+    RegionCodeGenTy RCG(CodeGen);
+    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *Args[] = {RTLoc, ThreadID};
+
+    NVPTXActionTy Action(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args,
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        Args);
+    RCG.setAction(Action);
+    RCG(CGF);
+  };
+  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
+                                                  PrePostActionTy &Action) {
+    CGBuilderTy &Bld = CGF.Builder;
+    llvm::Function *WFn = WrapperFunctionsMap[Fn];
+    assert(WFn && "Wrapper function does not exist!");
     llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
 
     // Prepare for parallel region. Indicate the outlined function.
     llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
-    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
-                            OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
-                        Args);
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+        Args);
 
     // Create a private scope that will globalize the arguments
     // passed from the outside of the target region.
@@ -1496,13 +1715,13 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
 
     // Store variable address in a list of references to pass to workers.
     unsigned Idx = 0;
     ASTContext &Ctx = CGF.getContext();
-    Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
-        Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
-        .castAs<PointerType>());
+    Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+        SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
+                        .castAs<PointerType>());
     for (llvm::Value *V : CapturedVars) {
-      Address Dst = Bld.CreateConstInBoundsGEP(
-          SharedArgListAddress, Idx, CGF.getPointerSize());
-      llvm::Value * PtrV;
+      Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                               CGF.getPointerSize());
+      llvm::Value *PtrV;
       if (V->getType()->isIntegerTy())
         PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
       else
@@ -1533,43 +1752,67 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
     Work.emplace_back(WFn);
   };
 
-  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-  llvm::Value *ThreadID = getThreadID(CGF, Loc);
-  llvm::Value *Args[] = {RTLoc, ThreadID};
-
-  auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
-                                                       PrePostActionTy &) {
-    auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
-                                                   PrePostActionTy &Action) {
-      Action.Enter(CGF);
-
-      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-      Address ZeroAddr =
-          CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
-                                /*DestWidth=*/32, /*Signed=*/1),
-                            ".zero.addr");
-      CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-      OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
-      OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
-    };
-
+  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen](
+                             CodeGenFunction &CGF, PrePostActionTy &Action) {
     RegionCodeGenTy RCG(CodeGen);
-    NVPTXActionTy Action(
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
-        Args,
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
-        Args);
-    RCG.setAction(Action);
-    RCG(CGF);
+    if (IsInParallelRegion) {
+      SeqGen(CGF, Action);
+    } else if (IsInTargetMasterThreadRegion) {
+      L0ParallelGen(CGF, Action);
+    } else {
+      // Check for master and then parallelism:
+      // if (is_master) {
+      //   Worker call.
+      // } else if (__kmpc_parallel_level(loc, gtid)) {
+      //   Serialized execution.
+      // } else {
+      //   Outlined function call.
+      // }
      CGBuilderTy &Bld = CGF.Builder;
+      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+      if (!isInSpmdExecutionMode()) {
+        llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+        llvm::BasicBlock *ParallelCheckBB =
+            CGF.createBasicBlock(".parallelcheck");
+        llvm::Value *IsMaster =
+            Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+        Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+        CGF.EmitBlock(MasterCheckBB);
+        L0ParallelGen(CGF, Action);
+        CGF.EmitBranch(ExitBB);
+        // There is no need to emit line number for unconditional branch.
+        (void)ApplyDebugLocation::CreateEmpty(CGF);
+        CGF.EmitBlock(ParallelCheckBB);
+      }
+      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+      llvm::Value *ThreadID = getThreadID(CGF, Loc);
+      llvm::Value *PL = CGF.EmitRuntimeCall(
+          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
+          {RTLoc, ThreadID});
+      llvm::Value *Res = Bld.CreateIsNotNull(PL);
+      llvm::BasicBlock *ThenBlock = CGF.createBasicBlock("omp_if.then");
+      llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
+      Bld.CreateCondBr(Res, ThenBlock, ElseBlock);
+      // Emit the 'then' code.
+      CGF.EmitBlock(ThenBlock);
+      SeqGen(CGF, Action);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      // Emit the 'else' code.
+      CGF.EmitBlock(ElseBlock);
+      RCG(CGF);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      // Emit the continuation block for code after the if.
+      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+    }
   };
 
   if (IfCond) {
-    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
   } else {
     CodeGenFunction::RunCleanupsScope Scope(CGF);
-    RegionCodeGenTy ThenRCG(L0ParallelGen);
+    RegionCodeGenTy ThenRCG(LNParallelGen);
     ThenRCG(CGF);
   }
 }
@@ -3090,6 +3333,9 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
 
 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                               const Decl *D) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   assert(D && "Expected function or captured|block decl.");
   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
          "Function is registered already.");
@@ -3143,6 +3389,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
 
 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                         const VarDecl *VD) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return Address::invalid();
+
   VD = VD->getCanonicalDecl();
   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 365d4f52aa..a5f39c28a7 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -25,7 +25,7 @@ namespace CodeGen {
 
 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
 private:
-  // Parallel outlined function work for workers to execute.
+  /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
 
   struct EntryFunctionState {
@@ -52,14 +52,14 @@ private:
   /// \brief Helper for worker function. Emit body of worker loop.
   void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
 
-  /// \brief Helper for generic target entry function. Guide the master and
+  /// \brief Helper for non-SPMD target entry function. Guide the master and
   /// worker threads to their respective locations.
-  void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
                               WorkerFunctionState &WST);
 
-  /// \brief Signal termination of OMP execution for generic target entry
+  /// \brief Signal termination of OMP execution for non-SPMD target entry
   /// function.
-  void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
   /// Helper for generic variables globalization prolog.
   void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc);
@@ -93,7 +93,7 @@ private:
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
   /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
-  void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
                          llvm::Function *&OutlinedFn,
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);
@@ -133,14 +133,14 @@ private:
   /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
   /// variables captured in a record which address is stored in \a
   /// CapturedStruct.
-  /// This call is for the Generic Execution Mode.
+  /// This call is for the Non-SPMD Execution Mode.
   /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
   /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
   /// \param CapturedVars A pointer to the record with the references to
   /// variables used in \a OutlinedFn function.
   /// \param IfCond Condition in the associated 'if' clause, if it was
   /// specified, nullptr otherwise.
-  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+  void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
                                llvm::Value *OutlinedFn,
                                ArrayRef<llvm::Value *> CapturedVars,
                                const Expr *IfCond);
@@ -304,15 +304,15 @@ public:
   Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                     const VarDecl *VD) override;
 
-  /// Target codegen is specialized based on two programming models: the
-  /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
-  /// model for constructs like 'target parallel' that support it.
-  enum ExecutionMode {
-    /// Single Program Multiple Data.
-    Spmd,
-    /// Generic codegen to support fork-join model.
+  /// Target codegen is specialized based on two data-sharing modes: CUDA, in
+  /// which the local variables are actually global threadlocal, and Generic,
+  /// in which the local variables are placed in global memory if they may
+  /// escape their declaration context.
+  enum DataSharingMode {
+    /// CUDA data sharing mode.
+    CUDA,
+    /// Generic data-sharing mode.
     Generic,
-    Unknown,
   };
 
   /// Cleans up references to the objects in finished function.
@@ -320,11 +320,17 @@ public:
   void functionFinished(CodeGenFunction &CGF) override;
 
 private:
-  // Track the execution mode when codegening directives within a target
-  // region. The appropriate mode (generic/spmd) is set on entry to the
-  // target region and used by containing directives such as 'parallel'
-  // to emit optimized code.
-  ExecutionMode CurrentExecutionMode;
+  /// Track the execution mode when codegening directives within a target
+  /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
+  /// target region and used by containing directives such as 'parallel'
+  /// to emit optimized code.
+  bool IsInSPMDExecutionMode = false;
+
+  /// true if we're emitting the code for the target region and the next
+  /// parallel region is L0 for sure.
+  bool IsInTargetMasterThreadRegion = false;
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
 
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 6e3747a188..4bb3c7b0d3 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -2399,8 +2399,17 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
 
   // For the device mark the function as one that should be emitted.
   if (getLangOpts().OpenMPIsDevice && OpenMPRuntime &&
       !OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() &&
-      !DontDefer && !IsForDefinition)
-    addDeferredDeclToEmit(GD);
+      !DontDefer && !IsForDefinition) {
+    const FunctionDecl *FDDef = FD->getDefinition();
+    GlobalDecl GDDef;
+    if (const auto *CD = dyn_cast<CXXConstructorDecl>(FDDef))
+      GDDef = GlobalDecl(CD, GD.getCtorType());
+    else if (const auto *DD = dyn_cast<CXXDestructorDecl>(FDDef))
+      GDDef = GlobalDecl(DD, GD.getDtorType());
+    else
+      GDDef = GlobalDecl(FDDef);
+    addDeferredDeclToEmit(GDDef);
+  }
 
   if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) {
     UpdateMultiVersionNames(GD, FD);
diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp
index 70f3973acd..718c650bec 100644
--- a/test/OpenMP/nvptx_target_codegen.cpp
+++ b/test/OpenMP/nvptx_target_codegen.cpp
@@ -9,15 +9,17 @@
 #define HEADER
 
 // Check that the execution mode of all 6 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l100}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l177}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l287}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l324}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l342}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l307}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l102}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l179}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l289}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l326}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l344}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l309}}_exec_mode = weak constant i8 1
 
 __thread int id;
 
+int baz(int f);
+
 template<typename tx, typename ty>
 struct TT{
   tx X;
@@ -33,7 +35,7 @@ int foo(int n) {
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l100}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l102}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -64,7 +66,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l100]]()
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l102]]()
   // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
   // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -106,7 +108,7 @@ int foo(int n) {
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l177}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l179}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -137,7 +139,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l177]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l179]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] 
to i16* @@ -180,7 +182,7 @@ int foo(int n) { id = aa; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l287}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l289}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -211,7 +213,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l287]](i[[SZ]] + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l289]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* @@ -343,6 +345,7 @@ struct S1 { { this->a = (double)b + 1.5; c[1][1] = ++a; + baz(a); } return c[1][1] + (int)b; @@ -364,7 +367,13 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+324}}_worker() +int baz(int f) { +#pragma omp parallel + f = 2; + return f; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+326}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -395,7 +404,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l324]](i[[SZ]] + // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l326]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -450,9 +459,10 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l342}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l344}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: store i8* null, i8** [[OMP_WORK_FN]], // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] @@ -469,6 +479,8 @@ int bar(int n){ // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] + // CHECK: [[WORK_FN:%.+]] = bitcast i8* [[WORK]] to void (i16, i32)* + // CHECK: call void [[WORK_FN]](i16 0, i32 [[GTID]]) // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[TERM_PARALLEL]] @@ -481,7 +493,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l342]]( + // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l344]]( // Create local storage for each capture. 
// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] @@ -528,6 +540,7 @@ int bar(int n){ // CHECK-64-DAG:load i32, i32* [[REF_B]] // CHECK-32-DAG:load i32, i32* [[LOCAL_B]] // CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}} + // CHECK: call i32 [[BAZ:@.*baz.*]](i32 % // CHECK: br label {{%?}}[[TERMINATE:.+]] // // CHECK: [[TERMINATE]] @@ -538,9 +551,48 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void + // CHECK: define i32 [[BAZ]](i32 [[F:%.*]]) + // CHECK: [[ZERO_ADDR:%.+]] = alloca i32, + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* + // CHECK: [[GTID_ADDR:%.+]] = alloca i32, + // CHECK: store i32 0, i32* [[ZERO_ADDR]] + // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) + // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* + // CHECK: [[F_PTR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[REC_ADDR]], i32 0, i32 0 + // CHECK: store i32 [[F]], i32* [[F_PTR]], + // CHECK: store i32 [[GTID]], i32* [[GTID_ADDR]], + // CHECK: icmp eq i32 + // CHECK: br i1 + + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*), i16 1) + // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 1) + // CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]], + // CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0 + // CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8* + // CHECK: store i8* [[F_REF]], i8** [[REF]], + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: call void @__kmpc_end_sharing_variables() + // CHECK: br label + + // CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: icmp ne i16 [[RES]], 0 + // CHECK: br i1 + + // CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]]) + // CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: br label + + // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]]) + // CHECK: br label + + // CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], + // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) + // CHECK: ret i32 [[RES]] - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l307}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l309}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -571,7 +623,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l307]](i[[SZ]] + // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l309]](i[[SZ]] // Create local storage for each capture. 
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp index ac1aae840c..7594d6f2d2 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -19,45 +19,20 @@ int main(int argc, char **argv) { return 0; } -// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker() -// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ -// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]]) +// CHECK: @__omp_offloading_{{.*}}_main_l16_exec_mode = weak constant i8 0 -// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) +// CHECK: define void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) // CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ -// CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() +// CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_for_static_init_4( -// CHECK: call void @__kmpc_kernel_prepare_parallel( -// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4) -// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]], -// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8* -// CHECK: store i8* [[LB]], i8** [[BUF_PTR]], -// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1 -// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8* -// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]], -// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2 -// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8* -// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]], -// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3 -// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8* -// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]], -// CHECK: call void @llvm.nvvm.barrier0() -// CHECK: call void @llvm.nvvm.barrier0() -// CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: br label -// CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @ -// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** % -// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]]) -// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @ +// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}}) // CHECK: br label % // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_kernel_deinit(i16 1) -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @__kmpc_spmd_kernel_deinit() // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}}) // CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0) @@ -75,24 +50,4 @@ int main(int argc, char **argv) { // CHECK: call void 
@__kmpc_data_sharing_pop_stack(i8* [[RES]]) -// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32) -// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]]) -// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]], -// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0 -// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}* -// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]], -// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1 -// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}* -// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]], -// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2 -// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32* -// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]], -// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64 -// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3 -// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32** -// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]], -// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]]) -// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]]) -// CHECK: ret void - #endif -- 2.40.0
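
As a closing usage sketch (a hypothetical offloading program; only baz
mirrors the updated test above), a parallel region reached from inside
another parallel region on the device is now detected via
__kmpc_parallel_level and executed serially:

    #include <cstdio>

    int baz(int f) {
    #pragma omp parallel   // L2 when reached from the region in main();
      f = 2;               // with this patch it executes sequentially
      return f;
    }

    int main() {
      int r = 0;
    #pragma omp target map(from : r)
    #pragma omp parallel   // L1: real parallelism on the device
    #pragma omp single
      r = baz(3);
      printf("r = %d\n", r); // prints r = 2
      return 0;
    }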