]> granicus.if.org Git - clang/commitdiff
[OPENMP, NVPTX] Improve globalization of the variables captured by value.
authorAlexey Bataev <a.bataev@hotmail.com>
Thu, 15 Mar 2018 18:10:54 +0000 (18:10 +0000)
committerAlexey Bataev <a.bataev@hotmail.com>
Thu, 15 Mar 2018 18:10:54 +0000 (18:10 +0000)
If the variable is captured by value and the corresponding parameter in
the outlined function escapes its declaration context, this parameter
must be globalized. To globalize it we need to get the address of the
original parameter, load the value, store it to the global address and
use this global address instead of the original.

Patch improves globalization for parallel|teams regions + functions in
declare target regions.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@327654 91177308-0d34-0410-b5e6-96231b3b80d8

lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
lib/CodeGen/CGOpenMPRuntimeNVPTX.h
lib/CodeGen/CGStmtOpenMP.cpp
test/OpenMP/declare_target_codegen_globalization.cpp [new file with mode: 0644]
test/OpenMP/nvptx_teams_codegen.cpp

index b71965cfb531c7a27adb5a4eddf584cc4b7fa237..637a86b4553477064404adf59e5cdc8de0a3f636 100644 (file)
@@ -171,16 +171,23 @@ class CheckVarsEscapingDeclContext final
     : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
   CodeGenFunction &CGF;
   llvm::SetVector<const ValueDecl *> EscapedDecls;
+  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
   llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls;
   bool AllEscaped = false;
   RecordDecl *GlobalizedRD = nullptr;
   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
 
   void markAsEscaped(const ValueDecl *VD) {
-    if (IgnoredDecls.count(VD) ||
-        (CGF.CapturedStmtInfo &&
-         CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))))
+    if (IgnoredDecls.count(VD))
       return;
+    // Variables captured by value must be globalized.
+    if (auto *CSI = CGF.CapturedStmtInfo) {
+      if (const FieldDecl *FD = CGF.CapturedStmtInfo->lookup(cast<VarDecl>(VD))) {
+        if (FD->getType()->isReferenceType())
+          return;
+        EscapedParameters.insert(VD);
+      }
+    }
     EscapedDecls.insert(VD);
   }
 
@@ -385,12 +392,15 @@ public:
         Visit(Child);
   }
 
+  /// Returns the record that handles all the escaped local variables and used
+  /// instead of their original storage.
   const RecordDecl *getGlobalizedRecord() {
     if (!GlobalizedRD)
       buildRecordForGlobalizedVars();
     return GlobalizedRD;
   }
 
+  /// Returns the field in the globalized record for the escaped variable.
   const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
     assert(GlobalizedRD &&
            "Record for globalized variables must be generated already.");
@@ -400,9 +410,16 @@ public:
     return I->getSecond();
   }
 
+  /// Returns the list of the escaped local variables/parameters.
   ArrayRef<const ValueDecl *> getEscapedDecls() const {
     return EscapedDecls.getArrayRef();
   }
+
+  /// Checks if the escaped local variable is actually a parameter passed by
+  /// value.
+  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
+    return EscapedParameters;
+  }
 };
 } // anonymous namespace
 
@@ -614,58 +631,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
       createNVPTXRuntimeFunction(
           OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
 
-  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
-  if (I == FunctionGlobalizedDecls.end())
-    return;
-  const RecordDecl *GlobalizedVarsRecord = I->getSecond().first;
-  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
-
-  // Recover pointer to this function's global record. The runtime will
-  // handle the specifics of the allocation of the memory.
-  // Use actual memory size of the record including the padding
-  // for alignment purposes.
-  unsigned Alignment =
-      CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
-  unsigned GlobalRecordSize =
-      CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
-  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-  // TODO: allow the usage of shared memory to be controlled by
-  // the user, for now, default to global.
-  llvm::Value *GlobalRecordSizeArg[] = {
-      llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
-      CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
-      GlobalRecordSizeArg);
-  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
-  FunctionToGlobalRecPtr.try_emplace(CGF.CurFn, GlobalRecValue);
-
-  // Emit the "global alloca" which is a GEP from the global declaration record
-  // using the pointer returned by the runtime.
-  LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
-  auto &Res = I->getSecond().second;
-  for (auto &Rec : Res) {
-    const FieldDecl *FD = Rec.second.first;
-    LValue VarAddr = CGF.EmitLValueForField(Base, FD);
-    Rec.second.second = VarAddr.getAddress();
-  }
+  emitGenericVarsProlog(CGF, WST.Loc);
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST) {
-  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
-  if (I != FunctionGlobalizedDecls.end()) {
-    if (!CGF.HaveInsertPoint())
-      return;
-    auto I = FunctionToGlobalRecPtr.find(CGF.CurFn);
-    if (I != FunctionToGlobalRecPtr.end()) {
-      llvm::Value *Args[] = {I->getSecond()};
-      CGF.EmitRuntimeCall(
-          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
-          Args);
-    }
-  }
+  emitGenericVarsEpilog(CGF);
+  if (!CGF.HaveInsertPoint())
+    return;
 
   if (!EST.ExitBB)
     EST.ExitBB = CGF.createBasicBlock(".exit");
@@ -1207,6 +1180,24 @@ void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+  SourceLocation Loc = D.getLocStart();
+
+  // Emit target region as a standalone region.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    SourceLocation &Loc;
+
+  public:
+    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+    void Enter(CodeGenFunction &CGF) override {
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericVarsProlog(CGF, Loc);
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericVarsEpilog(CGF);
+    }
+  } Action(Loc);
+  CodeGen.setAction(Action);
   auto *OutlinedFun =
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
@@ -1222,7 +1213,24 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
 llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+  SourceLocation Loc = D.getLocStart();
 
+  // Emit target region as a standalone region.
+  class NVPTXPrePostActionTy : public PrePostActionTy {
+    SourceLocation &Loc;
+
+  public:
+    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+    void Enter(CodeGenFunction &CGF) override {
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericVarsProlog(CGF, Loc);
+    }
+    void Exit(CodeGenFunction &CGF) override {
+      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+          .emitGenericVarsEpilog(CGF);
+    }
+  } Action(Loc);
+  CodeGen.setAction(Action);
   llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
       D, ThreadIDVar, InnermostKind, CodeGen);
   llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
@@ -1233,6 +1241,73 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
   return OutlinedFun;
 }
 
+void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
+                                                 SourceLocation Loc) {
+  CGBuilderTy &Bld = CGF.Builder;
+
+  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
+  if (I == FunctionGlobalizedDecls.end())
+    return;
+  const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord;
+  QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
+
+  // Recover pointer to this function's global record. The runtime will
+  // handle the specifics of the allocation of the memory.
+  // Use actual memory size of the record including the padding
+  // for alignment purposes.
+  unsigned Alignment =
+      CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
+  unsigned GlobalRecordSize =
+      CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
+  GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
+  // TODO: allow the usage of shared memory to be controlled by
+  // the user, for now, default to global.
+  llvm::Value *GlobalRecordSizeArg[] = {
+      llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
+      CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
+  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
+      GlobalRecordSizeArg);
+  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
+  LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
+  I->getSecond().GlobalRecordAddr = GlobalRecValue;
+
+  // Emit the "global alloca" which is a GEP from the global declaration record
+  // using the pointer returned by the runtime.
+  for (auto &Rec : I->getSecond().LocalVarData) {
+    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
+    llvm::Value *ParValue;
+    if (EscapedParam) {
+      const auto *VD = cast<VarDecl>(Rec.first);
+      LValue ParLVal =
+          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
+      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
+    }
+    const FieldDecl *FD = Rec.second.first;
+    LValue VarAddr = CGF.EmitLValueForField(Base, FD);
+    Rec.second.second = VarAddr.getAddress();
+    if (EscapedParam) {
+      const auto *VD = cast<VarDecl>(Rec.first);
+      CGF.EmitStoreOfScalar(ParValue, VarAddr);
+      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
+    }
+  }
+  I->getSecond().MappedParams->apply(CGF);
+}
+
+void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
+  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
+  if (I != FunctionGlobalizedDecls.end() && I->getSecond().GlobalRecordAddr) {
+    I->getSecond().MappedParams->restore(CGF);
+    if (!CGF.HaveInsertPoint())
+      return;
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
+        I->getSecond().GlobalRecordAddr);
+  }
+}
+
 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                          const OMPExecutableDirective &D,
                                          SourceLocation Loc,
@@ -1317,7 +1392,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
         llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                               Ctx.getPointerType(Ctx.VoidPtrTy));
-        Idx++;
+        ++Idx;
       }
     }
 
@@ -2750,8 +2825,8 @@ void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
       continue;
     }
     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-        NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
-                       /*AddrSpace=*/0));
+        NativeArg,
+        NativeArg->getType()->getPointerElementType()->getPointerTo());
     TargetArgs.emplace_back(
         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
   }
@@ -2788,7 +2863,7 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
   auto *Fn = llvm::Function::Create(
       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
       OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(/*D=*/GlobalDecl(), Fn, CGFI);
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
 
   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
@@ -2854,6 +2929,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
          "Function is registered already.");
   SmallVector<const ValueDecl *, 4> IgnoredDecls;
   const Stmt *Body = nullptr;
+  bool NeedToDelayGlobalization = false;
   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
     Body = FD->getBody();
   } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
@@ -2861,6 +2937,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
     Body = CD->getBody();
     if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) {
+      NeedToDelayGlobalization = true;
       if (const auto *CS = dyn_cast<CapturedStmt>(Body)) {
         IgnoredDecls.reserve(CS->capture_size());
         for (const auto &Capture : CS->captures())
@@ -2876,14 +2953,29 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
   if (!GlobalizedVarsRecord)
     return;
-  auto &Res =
-      FunctionGlobalizedDecls
-          .try_emplace(CGF.CurFn, GlobalizedVarsRecord, DeclToAddrMapTy())
-          .first->getSecond()
-          .second;
+  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
+  I->getSecond().MappedParams =
+      llvm::make_unique<CodeGenFunction::OMPMapVars>();
+  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
+  I->getSecond().EscapedParameters.insert(
+      VarChecker.getEscapedParameters().begin(),
+      VarChecker.getEscapedParameters().end());
+  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Res.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
+    Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
+  }
+  if (!NeedToDelayGlobalization) {
+    emitGenericVarsProlog(CGF, D->getLocStart());
+    struct GlobalizationScope final : EHScopeStack::Cleanup {
+      GlobalizationScope() = default;
+
+      void Emit(CodeGenFunction &CGF, Flags flags) override {
+        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
+            .emitGenericVarsEpilog(CGF);
+      }
+    };
+    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
   }
 }
 
@@ -2892,14 +2984,13 @@ Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
     return Address::invalid();
-  auto VDI = I->getSecond().second.find(VD);
-  if (VDI == I->getSecond().second.end())
+  auto VDI = I->getSecond().LocalVarData.find(VD);
+  if (VDI == I->getSecond().LocalVarData.end())
     return Address::invalid();
   return VDI->second.second;
 }
 
 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
-  FunctionToGlobalRecPtr.erase(CGF.CurFn);
   FunctionGlobalizedDecls.erase(CGF.CurFn);
   CGOpenMPRuntime::functionFinished(CGF);
 }
index ee1d727a57333fb0c4a163eb159288c169baeffc..7b1944dca565a6f0212f9f43a716053699394214 100644 (file)
@@ -61,6 +61,12 @@ private:
   /// function.
   void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
+  /// Helper for generic variables globalization prolog.
+  void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc);
+
+  /// Helper for generic variables globalization epilog.
+  void emitGenericVarsEpilog(CodeGenFunction &CGF);
+
   /// \brief Helper for Spmd mode target directive's entry function.
   void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
                            const OMPExecutableDirective &D);
@@ -332,13 +338,18 @@ private:
   /// The map of local variables to their addresses in the global memory.
   using DeclToAddrMapTy = llvm::MapVector<const Decl *,
       std::pair<const FieldDecl *, Address>>;
+  /// Set of the parameters passed by value escaping OpenMP context.
+  using EscapedParamsTy = llvm::SmallPtrSet<const Decl *, 4>;
+  struct FunctionData {
+    DeclToAddrMapTy LocalVarData;
+    const RecordDecl *GlobalRecord = nullptr;
+    llvm::Value *GlobalRecordAddr = nullptr;
+    EscapedParamsTy EscapedParameters;
+    std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
+  };
   /// Maps the function to the list of the globalized variables with their
   /// addresses.
-  llvm::DenseMap<llvm::Function *,
-                 std::pair<const RecordDecl *, DeclToAddrMapTy>>
-      FunctionGlobalizedDecls;
-  /// Map from function to global record pointer.
-  llvm::DenseMap<llvm::Function *, llvm::Value *> FunctionToGlobalRecPtr;
+  llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
 };
 
 } // CodeGen namespace.
index 9d077a2be16cc398ae00020f811a9f1116c119af..02bb8ae88a12726482bdf122d64e9808e8cd6983 100644 (file)
@@ -1262,7 +1262,7 @@ static void emitEmptyBoundParameters(CodeGenFunction &,
 
 void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
   // Emit parallel region as a standalone region.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     bool Copyins = CGF.EmitOMPCopyinClause(S);
     (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
@@ -1277,6 +1277,7 @@ void CodeGenFunction::EmitOMPParallelDirective(const OMPParallelDirective &S) {
     CGF.EmitOMPPrivateClause(S, PrivateScope);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.EmitStmt(S.getCapturedStmt(OMPD_parallel)->getCapturedStmt());
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel);
   };
@@ -2100,7 +2101,8 @@ emitInnerParallelForWhenCombined(CodeGenFunction &CGF,
                                  const OMPLoopDirective &S,
                                  CodeGenFunction::JumpDest LoopExit) {
   auto &&CGInlinedWorksharingLoop = [&S](CodeGenFunction &CGF,
-                                         PrePostActionTy &) {
+                                         PrePostActionTy &Action) {
+    Action.Enter(CGF);
     bool HasCancel = false;
     if (!isOpenMPSimdDirective(S.getDirectiveKind())) {
       if (const auto *D = dyn_cast<OMPTeamsDistributeParallelForDirective>(&S))
@@ -2682,7 +2684,8 @@ void CodeGenFunction::EmitOMPParallelForDirective(
     const OMPParallelForDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     OMPCancelStackRAII CancelRegion(CGF, OMPD_parallel_for, S.hasCancel());
     CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
                                emitDispatchForLoopBounds);
@@ -2695,7 +2698,8 @@ void CodeGenFunction::EmitOMPParallelForSimdDirective(
     const OMPParallelForSimdDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
                                emitDispatchForLoopBounds);
   };
@@ -2707,7 +2711,8 @@ void CodeGenFunction::EmitOMPParallelSectionsDirective(
     const OMPParallelSectionsDirective &S) {
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'sections' directive.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitSections(S);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_sections, CodeGen,
@@ -4033,12 +4038,13 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
 
 void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) {
   // Emit teams region as a standalone region.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
     CGF.EmitOMPPrivateClause(S, PrivateScope);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.EmitStmt(S.getCapturedStmt(OMPD_teams)->getCapturedStmt());
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
   };
@@ -4099,10 +4105,11 @@ emitTargetTeamsDistributeRegion(CodeGenFunction &CGF, PrePostActionTy &Action,
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute,
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4144,10 +4151,11 @@ static void emitTargetTeamsDistributeSimdRegion(
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute,
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4188,10 +4196,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeDirective(
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute,
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4209,10 +4218,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeSimdDirective(
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_simd,
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4231,10 +4241,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeParallelForDirective(
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_distribute,
                                                     CodeGenDistribute);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4253,10 +4264,11 @@ void CodeGenFunction::EmitOMPTeamsDistributeParallelForSimdDirective(
 
   // Emit teams region as a standalone region.
   auto &&CodeGen = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                            PrePostActionTy &) {
+                                            PrePostActionTy &Action) {
     OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
         CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4277,10 +4289,11 @@ static void emitTargetTeamsDistributeParallelForRegion(
 
   // Emit teams region as a standalone region.
   auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                                 PrePostActionTy &) {
+                                                 PrePostActionTy &Action) {
     CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
         CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4328,10 +4341,11 @@ static void emitTargetTeamsDistributeParallelForSimdRegion(
 
   // Emit teams region as a standalone region.
   auto &&CodeGenTeams = [&S, &CodeGenDistribute](CodeGenFunction &CGF,
-                                                 PrePostActionTy &) {
+                                                 PrePostActionTy &Action) {
     CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(
         CGF, OMPD_distribute, CodeGenDistribute, /*HasCancel=*/false);
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams);
@@ -4599,12 +4613,13 @@ static void emitTargetParallelRegion(CodeGenFunction &CGF,
   // Get the captured statement associated with the 'parallel' region.
   auto *CS = S.getCapturedStmt(OMPD_parallel);
   Action.Enter(CGF);
-  auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &Action) {
     CodeGenFunction::OMPPrivateScope PrivateScope(CGF);
     (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope);
     CGF.EmitOMPPrivateClause(S, PrivateScope);
     CGF.EmitOMPReductionClauseInit(S, PrivateScope);
     (void)PrivateScope.Privatize();
+    Action.Enter(CGF);
     // TODO: Add support for clauses.
     CGF.EmitStmt(CS->getCapturedStmt());
     CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel);
@@ -4643,7 +4658,8 @@ static void emitTargetParallelForRegion(CodeGenFunction &CGF,
   Action.Enter(CGF);
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CodeGenFunction::OMPCancelStackRAII CancelRegion(
         CGF, OMPD_target_parallel_for, S.hasCancel());
     CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
@@ -4683,7 +4699,8 @@ emitTargetParallelForSimdRegion(CodeGenFunction &CGF,
   Action.Enter(CGF);
   // Emit directive as a combined directive that consists of two implicit
   // directives: 'parallel' with 'for' directive.
-  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&CodeGen = [&S](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
     CGF.EmitOMPWorksharingLoop(S, S.getEnsureUpperBound(), emitForLoopBounds,
                                emitDispatchForLoopBounds);
   };
diff --git a/test/OpenMP/declare_target_codegen_globalization.cpp b/test/OpenMP/declare_target_codegen_globalization.cpp
new file mode 100644 (file)
index 0000000..db0df9b
--- /dev/null
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// expected-no-diagnostics
+
+int foo(int &a) { return a; }
+
+int bar() {
+  int a;
+  return foo(a);
+}
+
+// CHECK: define void @__omp_offloading_{{.*}}maini1{{.*}}_l[[@LINE+5]](i32* dereferenceable{{.*}})
+// CHECK-NOT: @__kmpc_data_sharing_push_stack
+
+int maini1() {
+  int a;
+#pragma omp target parallel map(from:a)
+  {
+    int b;
+    a = foo(b) + bar();
+  }
+  return a;
+}
+
+// parallel region
+// CHECK: define {{.*}}void @{{.*}}(i32* noalias {{.*}}, i32* noalias {{.*}}, i32* dereferenceable{{.*}})
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%struct[.].*]]*
+// CHECK: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}}[[FOO:@.*foo.*]](i32* dereferenceable{{.*}} [[B_ADDR]])
+// CHECK: call {{.*}}[[BAR:@.*bar.*]]()
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+// CHECK: ret void
+
+// CHECK: define {{.*}}[[FOO]](i32* dereferenceable{{.*}})
+// CHECK-NOT: @__kmpc_data_sharing_push_stack
+
+// CHECK: define {{.*}}[[BAR]]()
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%struct[.].*]]*
+// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}}[[FOO]](i32* dereferenceable{{.*}} [[A_ADDR]])
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+// CHECK: ret i32
index b26d47c706acaf5f05c224e35b67dcb51afab42e..c530d19fd7bf4cfba76309c30f8f81995d18671e 100644 (file)
@@ -36,8 +36,12 @@ int main (int argc, char **argv) {
 // CK1:  store {{.+}} 0, {{.+}},
 // CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
-// CK1-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
-// CK1-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
+// CK1-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
+// CK1-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
+// CK1:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CK1:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK1:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK1:  store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
 // CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
@@ -49,6 +53,10 @@ int main (int argc, char **argv) {
 // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
 // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
 // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
+// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
+// CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
+// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
 // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
 // CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
@@ -104,8 +112,12 @@ int main (int argc, char **argv) {
 // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
 // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
 // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
-// CK2-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
-// CK2-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2:  call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
+// CK2-64:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
+// CK2-32:  [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
+// CK2:  [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CK2:  store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK2:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK2:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
 // CK2-NOT:  {{.+}} = call i32 @__kmpc_push_num_teams(
@@ -121,6 +133,10 @@ int main (int argc, char **argv) {
 // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
 // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
 // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
+// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
+// CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
+// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
 // CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
 // CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],