From: Alexey Bataev <a.bataev@hotmail.com>
Date: Tue, 5 Mar 2019 17:47:18 +0000 (+0000)
Subject: [OPENMP]Target region: emit const firstprivates as globals with constant
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8650dc5d838e74671340f7bca62beedf75ae8a54;p=clang

[OPENMP]Target region: emit const firstprivates as globals with constant
memory.

If the variable with the constant non-scalar type is firstprivatized in
the target region, the local copy is created with the data copying.
Instead, we allocate the copy in the constant memory and avoid extra
copying in the outlined target regions. This global copy is used in the
target regions without loss of the performance.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@355418 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 49ba14c813..64fdefd9a6 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2945,9 +2945,8 @@ Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF,
   return ThreadIDTemp;
 }
 
-llvm::Constant *
-CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty,
-                                             const llvm::Twine &Name) {
+llvm::Constant *CGOpenMPRuntime::getOrCreateInternalVariable(
+    llvm::Type *Ty, const llvm::Twine &Name, unsigned AddressSpace) {
   SmallString<256> Buffer;
   llvm::raw_svector_ostream Out(Buffer);
   Out << Name;
@@ -2962,7 +2961,8 @@ CGOpenMPRuntime::getOrCreateInternalVariable(llvm::Type *Ty,
   return Elem.second = new llvm::GlobalVariable(
              CGM.getModule(), Ty, /*IsConstant*/ false,
              llvm::GlobalValue::CommonLinkage, llvm::Constant::getNullValue(Ty),
-             Elem.first());
+             Elem.first(), /*InsertBefore=*/nullptr,
+             llvm::GlobalValue::NotThreadLocal, AddressSpace);
 }
 
 llvm::Value *CGOpenMPRuntime::getCriticalRegionLock(StringRef CriticalName) {
@@ -7285,9 +7285,14 @@ private:
     // A first private variable captured by reference will use only the
     // 'private ptr' and 'map to' flag. Return the right flags if the captured
     // declaration is known as first-private in this handler.
-    if (FirstPrivateDecls.count(Cap.getCapturedVar()))
+    if (FirstPrivateDecls.count(Cap.getCapturedVar())) {
+      if (Cap.getCapturedVar()->getType().isConstant(CGF.getContext()) &&
+          Cap.getCaptureKind() == CapturedStmt::VCK_ByRef)
+        return MappableExprsHandler::OMP_MAP_ALWAYS |
+               MappableExprsHandler::OMP_MAP_TO;
       return MappableExprsHandler::OMP_MAP_PRIVATE |
              MappableExprsHandler::OMP_MAP_TO;
+    }
     return MappableExprsHandler::OMP_MAP_TO |
            MappableExprsHandler::OMP_MAP_FROM;
   }
@@ -7914,9 +7919,6 @@ public:
       }
     } else {
       assert(CI.capturesVariable() && "Expected captured reference.");
-      CurBasePointers.push_back(CV);
-      CurPointers.push_back(CV);
-
       const auto *PtrTy = cast<ReferenceType>(RI.getType().getTypePtr());
       QualType ElementType = PtrTy->getPointeeType();
       CurSizes.push_back(CGF.getTypeSize(ElementType));
@@ -7924,6 +7926,24 @@ public:
       // default the value doesn't have to be retrieved. For an aggregate
       // type, the default is 'tofrom'.
       CurMapTypes.push_back(getMapModifiersForPrivateClauses(CI));
+      const VarDecl *VD = CI.getCapturedVar();
+      if (FirstPrivateDecls.count(VD) &&
+          VD->getType().isConstant(CGF.getContext())) {
+        llvm::Constant *Addr =
+            CGF.CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF, VD);
+        // Copy the value of the original variable to the new global copy.
+        CGF.Builder.CreateMemCpy(
+            CGF.MakeNaturalAlignAddrLValue(Addr, ElementType).getAddress(),
+            Address(CV, CGF.getContext().getTypeAlignInChars(ElementType)),
+            CurSizes.back(),
+            /*isVolatile=*/false);
+        // Use new global variable as the base pointers.
+        CurBasePointers.push_back(Addr);
+        CurPointers.push_back(Addr);
+      } else {
+        CurBasePointers.push_back(CV);
+        CurPointers.push_back(CV);
+      }
     }
     // Every default map produces a single argument which is a target parameter.
     CurMapTypes.back() |= OMP_MAP_TARGET_PARAM;
@@ -8725,6 +8745,40 @@ bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
   return false;
 }
 
+llvm::Constant *
+CGOpenMPRuntime::registerTargetFirstprivateCopy(CodeGenFunction &CGF,
+                                                const VarDecl *VD) {
+  assert(VD->getType().isConstant(CGM.getContext()) &&
+         "Expected constant variable.");
+  StringRef VarName;
+  llvm::Constant *Addr;
+  llvm::GlobalValue::LinkageTypes Linkage;
+  QualType Ty = VD->getType();
+  SmallString<128> Buffer;
+  {
+    unsigned DeviceID;
+    unsigned FileID;
+    unsigned Line;
+    getTargetEntryUniqueInfo(CGM.getContext(), VD->getLocation(), DeviceID,
+                             FileID, Line);
+    llvm::raw_svector_ostream OS(Buffer);
+    OS << "__omp_offloading_firstprivate_" << llvm::format("_%x", DeviceID)
+       << llvm::format("_%x_", FileID) << VD->getName() << "_l" << Line;
+    VarName = OS.str();
+  }
+  Linkage = llvm::GlobalValue::InternalLinkage;
+  Addr =
+      getOrCreateInternalVariable(CGM.getTypes().ConvertTypeForMem(Ty), VarName,
+                                  getDefaultFirstprivateAddressSpace());
+  cast<llvm::GlobalValue>(Addr)->setLinkage(Linkage);
+  CharUnits VarSize = CGM.getContext().getTypeSizeInChars(Ty);
+  CGM.addCompilerUsedGlobal(cast<llvm::GlobalValue>(Addr));
+  OffloadEntriesInfoManager.registerDeviceGlobalVarEntryInfo(
+      VarName, Addr, VarSize,
+      OffloadEntriesInfoManagerTy::OMPTargetGlobalVarEntryTo, Linkage);
+  return Addr;
+}
+
 void CGOpenMPRuntime::registerTargetGlobalVariable(const VarDecl *VD,
                                                    llvm::Constant *Addr) {
   llvm::Optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h
index cbac691d52..58a2358e75 100644
--- a/lib/CodeGen/CGOpenMPRuntime.h
+++ b/lib/CodeGen/CGOpenMPRuntime.h
@@ -708,7 +708,8 @@ private:
   /// must be the same.
   /// \param Name Name of the variable.
   llvm::Constant *getOrCreateInternalVariable(llvm::Type *Ty,
-                                              const llvm::Twine &Name);
+                                              const llvm::Twine &Name,
+                                              unsigned AddressSpace = 0);
 
   /// Set of threadprivate variables with the generated initializer.
   llvm::StringSet<> ThreadPrivateWithDefinition;
@@ -761,6 +762,10 @@ private:
                             llvm::Function *TaskFunction, QualType SharedsTy,
                             Address Shareds, const OMPTaskDataTy &Data);
 
+  /// Returns default address space for the constant firstprivates, 0 by
+  /// default.
+  virtual unsigned getDefaultFirstprivateAddressSpace() const { return 0; }
+
 public:
   explicit CGOpenMPRuntime(CodeGenModule &CGM)
       : CGOpenMPRuntime(CGM, ".", ".") {}
@@ -1414,6 +1419,11 @@ public:
   virtual void registerTargetGlobalVariable(const VarDecl *VD,
                                             llvm::Constant *Addr);
 
+  /// Registers provided target firstprivate variable as global on the
+  /// target.
+  llvm::Constant *registerTargetFirstprivateCopy(CodeGenFunction &CGF,
+                                                 const VarDecl *VD);
+
   /// Emit the global \a GD if it is meaningful for the target. Returns
   /// if it was emitted successfully.
   /// \param GD Global to scan.
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 17ee48fe01..534f91cd35 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -4438,6 +4438,10 @@ CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
     if (Attr->getCaptureKind() == OMPC_map) {
       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                         LangAS::opencl_global);
+    } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
+               PointeeTy.isConstant(CGM.getContext())) {
+      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
+                                                        LangAS::opencl_generic);
     }
   }
   ArgType = CGM.getContext().getPointerType(PointeeTy);
@@ -4825,6 +4829,10 @@ void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
   }
 }
 
+unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
+  return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
+}
+
 // Get current CudaArch and ignore any unknown values
 static CudaArch getCudaArch(CodeGenModule &CGM) {
   if (!CGM.getTarget().hasFeature("ptx"))
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index cc66c4659e..5e59b64ad1 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -386,6 +386,10 @@ public:
   void checkArchForUnifiedAddressing(CodeGenModule &CGM,
                                      const OMPRequiresDecl *D) const override;
 
+  /// Returns default address space for the constant firstprivates, __constant__
+  /// address space by default.
+  unsigned getDefaultFirstprivateAddressSpace() const override;
+
 private:
   /// Track the execution mode when codegening directives within a target
   /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index 146525b835..020f0b2248 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -725,6 +725,9 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
                                                 OMPPrivateScope &PrivateScope) {
   if (!HaveInsertPoint())
     return false;
+  bool DeviceConstTarget =
+      getLangOpts().OpenMPIsDevice &&
+      isOpenMPTargetExecutionDirective(D.getDirectiveKind());
   bool FirstprivateIsLastprivate = false;
   llvm::DenseSet<const VarDecl *> Lastprivates;
   for (const auto *C : D.getClausesOfKind<OMPLastprivateClause>()) {
@@ -754,6 +757,16 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
         ++InitsRef;
         continue;
       }
+      // Do not emit copy for firstprivate constant variables in target regions,
+      // captured by reference.
+      if (DeviceConstTarget && OrigVD->getType().isConstant(getContext()) &&
+          FD && FD->getType()->isReferenceType()) {
+        (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(*this,
+                                                                    OrigVD);
+        ++IRef;
+        ++InitsRef;
+        continue;
+      }
       FirstprivateIsLastprivate =
           FirstprivateIsLastprivate || ThisFirstprivateIsLastprivate;
       if (EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl()).second) {
diff --git a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
index 8ffc08294a..c99b43c27f 100644
--- a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
+++ b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -13,9 +13,11 @@ struct TT {
   ty Y;
 };
 
-// TCHECK:  [[TT:%.+]] = type { i64, i8 }
-// TCHECK:  [[S1:%.+]] = type { double }
+// TCHECK-DAG:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK-DAG:  [[TTII:%.+]] = type { i32, i32 }
+// TCHECK-DAG:  [[S1:%.+]] = type { double }
 
+// TCHECK: @__omp_offloading_firstprivate__{{.+}}_e_l28 = internal addrspace(4) global [[TTII]] zeroinitializer
 // TCHECK: @{{.*}}_$_{{.*}} = common global i32 0, !dbg !{{[0-9]+}}
 int foo(int n, double *ptr) {
   int a = 0;
@@ -23,16 +25,20 @@ int foo(int n, double *ptr) {
   float b[10];
   double c[5][10];
   TT<long long, char> d;
+  const TT<int, int> e = {n, n};
 
-#pragma omp target firstprivate(a) map(tofrom \
-                                       : b)
+#pragma omp target firstprivate(a, e) map(tofrom \
+                                          : b)
   {
     b[a] = a;
+    b[a] += e.X;
   }
 
-  // TCHECK:  define {{.*}}void @__omp_offloading_{{.+}}([10 x float] addrspace(1)* noalias [[B_IN:%.+]], i{{[0-9]+}} [[A_IN:%.+]])
+  // TCHECK:  define {{.*}}void @__omp_offloading_{{.+}}([10 x float] addrspace(1)* noalias [[B_IN:%.+]], i{{[0-9]+}} [[A_IN:%.+]], [[TTII]]* noalias [[E_IN:%.+]])
+  // TCHECK-NOT: alloca [[TTII]],
   // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
-  // TCHECK-NOT:  alloca i{{[0-9]+}},
+  // TCHECK-NOT: alloca [[TTII]],
+  // TCHECK-NOT: alloca i{{[0-9]+}},
   // TCHECK-64:  call void @llvm.dbg.declare(metadata [10 x float] addrspace(1)** %{{.+}}, metadata !{{[0-9]+}}, metadata !DIExpression())
   // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
   // TCHECK:  ret void
diff --git a/test/OpenMP/target_firstprivate_codegen.cpp b/test/OpenMP/target_firstprivate_codegen.cpp
index 4a2837b3c8..e1d5019a2b 100644
--- a/test/OpenMP/target_firstprivate_codegen.cpp
+++ b/test/OpenMP/target_firstprivate_codegen.cpp
@@ -38,30 +38,32 @@
 #ifndef HEADER
 #define HEADER
 
-template<typename tx, typename ty>
-struct TT{
+template <typename tx, typename ty>
+struct TT {
   tx X;
   ty Y;
 };
 
-// CHECK:  [[TT:%.+]] = type { i64, i8 }
-// CHECK:  [[S1:%.+]] = type { double }
+// CHECK-DAG:  [[TT:%.+]] = type { i64, i8 }
+// CHECK-DAG:  [[TTII:%.+]] = type { i32, i32 }
+// CHECK-DAG:  [[S1:%.+]] = type { double }
 
-// TCHECK:  [[TT:%.+]] = type { i64, i8 }
-// TCHECK:  [[S1:%.+]] = type { double }
+// TCHECK-DAG:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK-DAG:  [[TTII:%.+]] = type { i32, i32 }
+// TCHECK-DAG:  [[S1:%.+]] = type { double }
 
+// CHECK-DAG:  [[FP_E:@__omp_offloading_firstprivate_.+_e_l76]] = internal global [[TTII]] zeroinitializer
 // CHECK-DAG:  [[SIZET:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 4]
-// CHECK:  [[MAPT:@.+]] = private unnamed_addr constant [1 x i64] [i64 800]
+// CHECK-DAG:  [[MAPT:@.+]] = private unnamed_addr constant [1 x i64] [i64 800]
 // CHECK-DAG:  [[MAPT2:@.+]] = private unnamed_addr constant [9 x i64] [i64 800, i64 673, i64 288, i64 673, i64 673, i64 288, i64 288, i64 673, i64 673]
-// CHECK-DAG:  [[SIZET3:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] zeroinitializer
-// CHECK-DAG:  [[MAPT3:@.+]] = private unnamed_addr constant [1 x i64] [i64 544]
+// CHECK-DAG:  [[SIZET3:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i{{32|64}} 0, i{{32|64}} 8]
+// CHECK-DAG:  [[MAPT3:@.+]] = private unnamed_addr constant [2 x i64] [i64 544, i64 549]
 // CHECK-DAG:  [[MAPT4:@.+]] = private unnamed_addr constant [6 x i64] [i64 32, i64 281474976711171, i64 800, i64 288, i64 288, i64 673]
 // CHECK-DAG:  [[SIZET5:@.+]] = private unnamed_addr constant [3 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 1, i[[SZ]] 40]
 // CHECK-DAG:  [[MAPT5:@.+]] = private unnamed_addr constant [3 x i64] [i64 800, i64 800, i64 673]
 // CHECK-DAG:  [[SIZET6:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 40]
 // CHECK-DAG:  [[MAPT6:@.+]] = private unnamed_addr constant [2 x i64] [i64 800, i64 673]
 
-
 // CHECK: define {{.*}}[[FOO:@.+]](
 int foo(int n, double *ptr) {
   int a = 0;
@@ -71,8 +73,9 @@ int foo(int n, double *ptr) {
   double c[5][10];
   double cn[5][n];
   TT<long long, char> d;
-  
-  #pragma omp target firstprivate(a)
+  const TT<int, int> e = {n, n};
+
+#pragma omp target firstprivate(a)
   {
   }
 
@@ -92,14 +95,14 @@ int foo(int n, double *ptr) {
   // CHECK:  [[BASE_PTR_ARR2:%.+]] = alloca [9 x i8*],
   // CHECK:  [[PTR_ARR2:%.+]] = alloca [9 x i8*],
   // CHECK:  [[SIZET2:%.+]] = alloca [9 x i{{[0-9]+}}],
-  // CHECK:  [[BASE_PTR_ARR3:%.+]] = alloca [1 x i8*],
-  // CHECK:  [[PTR_ARR3:%.+]] = alloca [1 x i8*],  
+  // CHECK:  [[BASE_PTR_ARR3:%.+]] = alloca [2 x i8*],
+  // CHECK:  [[PTR_ARR3:%.+]] = alloca [2 x i8*],
   // CHECK:  [[N_ADDR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
   // CHECK-64:  [[N_EXT:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL]] to i{{[0-9]+}}
   // CHECK:  [[SSAVE_RET:%.+]] = call i8* @llvm.stacksave()
   // CHECK:  store i8* [[SSAVE_RET]], i8** [[SSTACK]],
   // CHECK-64:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_EXT]],
-  // CHECK-32:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_ADDR_VAL]],  
+  // CHECK-32:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_ADDR_VAL]],
   // CHECK:  [[N_ADDR_VAL2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
   // CHECK-64:  [[N_EXT2:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL2]] to i{{[0-9]+}}
   // CHECK-64:  [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
@@ -119,15 +122,15 @@ int foo(int n, double *ptr) {
   // CHECK:  [[BASE_PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  [[PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  {{.+}} = call i32 @__tgt_target(i64 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG]], i8** [[PTR_GEP_ARG]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET]], i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* [[MAPT]], i32 0, i32 0))
-  
+
   // TCHECK:  define weak void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]])
   // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
   // TCHECK-NOT: alloca i{{[0-9]+}},
   // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
   // TCHECK-NOT: store i{{[0-9]+}} %
-  // TCHECK:  ret void  
+  // TCHECK:  ret void
 
-#pragma omp target firstprivate(aa,b,bn,c,cn,d)
+#pragma omp target firstprivate(aa, b, bn, c, cn, d)
   {
     aa += 1;
     b[2] = 1.0;
@@ -135,7 +138,7 @@ int foo(int n, double *ptr) {
     c[1][2] = 1.0;
     cn[1][3] = 1.0;
     d.X = 1;
-    d.Y = 1;    
+    d.Y = 1;
   }
 
   // CHECK:  [[A2VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2]],
@@ -143,7 +146,7 @@ int foo(int n, double *ptr) {
   // CHECK:  store i{{[0-9]+}} [[A2VAL]], i{{[0-9]+}}* [[A2CASTCONV]],
   // CHECK:  [[A2CAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2CAST]],
   // CHECK-64:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_EXT]], 4
-  // CHECK-32:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_ADDR_VAL]], 4  
+  // CHECK-32:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_ADDR_VAL]], 4
   // CHECK-64:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
   // CHECK-32:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]]
   // CHECK:  [[CN_SIZE_2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SIZE_1]], 8
@@ -184,7 +187,7 @@ int foo(int n, double *ptr) {
   // CHECK:  store float* [[BN_VLA]], float** [[BCAST_TOPTR]],
   // CHECK: [[SIZE_GEPBN_3:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
   // CHECK:  store i{{[0-9]+}} [[BN_SIZE]], i{{[0-9]+}}* [[SIZE_GEPBN_3]]
-  
+
   // firstprivate(c): base_ptr = &c[0], ptr = &c[0], size = 400 (5*10*sizeof(double))
   // CHECK:  [[BASE_PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
   // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_4]] to [5 x [10 x double]]**
@@ -194,7 +197,7 @@ int foo(int n, double *ptr) {
   // CHECK:  store [5 x [10 x double]]* [[C]], [5 x [10 x double]]** [[BCAST_TOPTR]],
   // CHECK:  [[SIZE_GEPC_4:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
   // CHECK:  store i{{[0-9]+}} 400, i{{[0-9]+}}* [[SIZE_GEPC_4]],
-  
+
   // firstprivate(cn), 3 entries, 5, n, cn: (1) base_ptr = 5, ptr = 5, size = 8; (2) (1) base_ptr = n, ptr = n, size = 8; (3) base_ptr = &cn[0], ptr = &cn[0], size = 5*n*sizeof(double)
   // CHECK:  [[BASE_PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
   // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_5]] to i{{[0-9]+}}*
@@ -222,8 +225,8 @@ int foo(int n, double *ptr) {
   // CHECK:  store double* [[CN_VLA]], double** [[BCAST_TOPTR]],
   // CHECK:  [[SIZE_GEPCN_7:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
   // CHECK:  store i{{[0-9]+}} [[CN_SIZE_2]], i{{[0-9]+}}* [[SIZE_GEPCN_7]],
-  
-  // firstprivate(d): base_ptr = &d, ptr = &d, size = 16 
+
+  // firstprivate(d): base_ptr = &d, ptr = &d, size = 16
   // CHECK:  [[BASE_PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
   // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP2_8]] to [[TT]]**
   // CHECK:  store [[TT]]* [[D]], [[TT]]** [[BCAST_TOPTR]],
@@ -232,13 +235,12 @@ int foo(int n, double *ptr) {
   // CHECK:  store [[TT]]* [[D]], [[TT]]** [[BCAST_TOPTR]],
   // CHECK:  [[SIZE_GEPCN_8:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
   // CHECK:  store i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}}* [[SIZE_GEPCN_8]],
-  
-  
+
   // CHECK:  [[BASE_PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  [[PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  [[SIZES_ARG2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[SIZET2]],  i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK: {{.+}} = call i32 @__tgt_target(i64 -1, {{.+}}, i32 9, i8** [[BASE_PTR_GEP_ARG2]], i8** [[PTR_GEP_ARG2]], i[[SZ]]* [[SIZES_ARG2]], i64* getelementptr inbounds ([9 x i64], [9 x i64]* [[MAPT2]], i32 0, i32 0))
-  
+
   // make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the
   // target region
   // TCHECK:  define {{.*}}void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], i{{[0-9]+}} [[BN_SZ:%.+]], float* {{.+}} [[BN_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], i{{[0-9]+}} [[CN_SZ1:%.+]], i{{[0-9]+}} [[CN_SZ2:%.+]], double* {{.+}} [[CN_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]])
@@ -297,7 +299,7 @@ int foo(int n, double *ptr) {
   // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
   // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
   // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[C_PRIV_BCAST]], i8* align {{[0-9]+}} [[C_IN_BCAST]],{{.+}})
-  
+
   // firstprivate(cn)
   // TCHECK:  [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
   // TCHECK:  [[CN_PRIV:%.+]] = alloca double, i{{[0-9]+}} [[CN_SZ]],
@@ -306,32 +308,43 @@ int foo(int n, double *ptr) {
   // TCHECK:  [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8*
   // TCHECK:  [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8*
   // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[CN_PRIV_BCAST]], i8* align {{[0-9]+}} [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
-  
+
   // firstprivate(d)
   // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
   // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
   // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* align {{[0-9]+}} [[D_PRIV_BCAST]], i8* align {{[0-9]+}} [[D_IN_BCAST]],{{.+}})
 
-  
-  #pragma omp target firstprivate(ptr)
+#pragma omp target firstprivate(ptr, e)
   {
+    ptr[0] = e.X;
     ptr[0]++;
   }
   // CHECK:  [[PTR_ADDR_REF:%.+]] = load double*, double** [[PTR_ADDR]],
 
-  // CHECK:  [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[FP_E_BC:%.+]] = bitcast [[TTII]]* [[FP_E]] to i8*
+  // CHECK:  [[E_BC:%.+]] = bitcast [[TTII]]* [[E:%.+]] to i8*
+  // CHECK:  call void @llvm.memcpy.p0i8.p0i8.i{{64|32}}(i8* {{.*}} [[FP_E_BC]], i8* {{.*}} [[E_BC]], i{{64|32}} 8, i1 false)
+  // CHECK:  [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_0]] to double**
   // CHECK:  store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]],
-  // CHECK:  [[PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP3_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
   // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP3_0]] to double**
   // CHECK:  store double* [[PTR_ADDR_REF]], double** [[BCAST_TOPTR]],
-
-  // CHECK:  [[BASE_PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-  // CHECK:  [[PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-  // CHECK: {{.+}} = call i32 @__tgt_target(i64 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG3]], i8** [[PTR_GEP_ARG3]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* [[MAPT3]], i32 0, i32 0))
-
-  // TCHECK:  define weak void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]])
+  // CHECK:  [[BASE_PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[BASE_PTR_GEP3_1]] to [[TTII]]**
+  // CHECK:  store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]],
+  // CHECK:  [[PTR_GEP3_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  [[BCAST_TOPTR:%.+]] = bitcast i8** [[PTR_GEP3_1]] to [[TTII]]**
+  // CHECK:  store [[TTII]]* [[FP_E]], [[TTII]]** [[BCAST_TOPTR]],
+
+  // CHECK:  [[BASE_PTR_GEP_ARG3:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG3:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK: {{.+}} = call i32 @__tgt_target(i64 -1, {{.+}}, i32 2, i8** [[BASE_PTR_GEP_ARG3]], i8** [[PTR_GEP_ARG3]], i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* [[MAPT3]], i32 0, i32 0))
+
+  // TCHECK:  define weak void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]], [[TTII]]* dereferenceable{{.+}} [[E:%.+]])
+  // TCHECK-NOT: alloca [[TTII]],
   // TCHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // TCHECK-NOT: alloca [[TTII]],
   // TCHECK-NOT: alloca double*,
   // TCHECK:  store double* [[PTR_IN]], double** [[PTR_ADDR]],
   // TCHECK-NOT: store double* %
@@ -339,13 +352,12 @@ int foo(int n, double *ptr) {
   return a;
 }
 
-
-template<typename tx>
+template <typename tx>
 tx ftemplate(int n) {
   tx a = 0;
   tx b[10];
 
-#pragma omp target firstprivate(a,b)
+#pragma omp target firstprivate(a, b)
   {
     a += 1;
     b[2] += 1;
@@ -354,13 +366,12 @@ tx ftemplate(int n) {
   return a;
 }
 
-static
-int fstatic(int n) {
+static int fstatic(int n) {
   int a = 0;
   char aaa = 0;
   int b[10];
 
-#pragma omp target firstprivate(a,aaa,b)
+#pragma omp target firstprivate(a, aaa, b)
   {
     a += 1;
     aaa += 1;
@@ -398,11 +409,11 @@ int fstatic(int n) {
 struct S1 {
   double a;
 
-  int r1(int n){
-    int b = n+1;
+  int r1(int n) {
+    int b = n + 1;
     short int c[2][n];
 
-#pragma omp target firstprivate(b,c)
+#pragma omp target firstprivate(b, c)
     {
       this->a = (double)b + 1.5;
       c[1][1] = ++a;
@@ -499,7 +510,7 @@ struct S1 {
 
   // firstprivate(b)
   // TCHECK-NOT:  store i{{[0-9]+}} %
- 
+
   // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
   // TCHECK:  store i8* [[RET_STACK:%.+]], i8** [[SSTACK]],
 
@@ -517,7 +528,6 @@ struct S1 {
   // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
   // TCHECK: ret void
 
-
   // static host function
   // CHECK:  define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}})
   // CHECK:  [[BASE_PTRS5:%.+]] = alloca [3 x i8*],
@@ -551,9 +561,7 @@ struct S1 {
   // CHECK:  call i32 @__tgt_target(i64 -1, {{.+}}, i32 3, i8** {{.+}}, i8** {{.+}}, i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i64* getelementptr inbounds ([3 x i64], [3 x i64]* [[MAPT5]], i32 0, i32 0))
 };
 
-
-
-int bar(int n, double *ptr){
+int bar(int n, double *ptr) {
   int a = 0;
   a += foo(n, ptr);
   S1 S;