OMPRTL_NVPTX__kmpc_kernel_init,
/// \brief Call to void __kmpc_kernel_deinit();
OMPRTL_NVPTX__kmpc_kernel_deinit,
+ /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+ /// short RequiresOMPRuntime, short RequiresDataSharing);
+ OMPRTL_NVPTX__kmpc_spmd_kernel_init,
+ /// \brief Call to void __kmpc_spmd_kernel_deinit();
+ OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
/// \brief Call to void __kmpc_kernel_prepare_parallel(void
/// *outlined_function);
OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
}
};
+
+// A class to track the execution mode when generating code for directives
+// within a target region. The appropriate mode (generic/spmd) is set on
+// entry to the target region and used by directives nested within it, such
+// as 'parallel', to emit optimized code.
+class ExecutionModeRAII {
+private:
+ CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
+ CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+
+public:
+ ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
+ CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
+ : Mode(Mode) {
+ SavedMode = Mode;
+ Mode = NewMode;
+ }
+ ~ExecutionModeRAII() { Mode = SavedMode; }
+};
} // anonymous namespace
/// Get the GPU warp size.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
/// Get the value of the thread_limit clause in the teams directive.
-/// The runtime encodes thread_limit in the launch parameter, always starting
-/// thread_limit+warpSize threads per team.
-static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
+/// For the 'generic' execution mode, the runtime encodes thread_limit in
+/// the launch parameters, always starting thread_limit+warpSize threads per
+/// CTA. The threads in the last warp are reserved for master execution.
+/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
+static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
+ bool IsInSpmdExecutionMode = false) {
CGBuilderTy &Bld = CGF.Builder;
- return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
- "thread_limit");
+ return IsInSpmdExecutionMode
+ ? getNVPTXNumThreads(CGF)
+ : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
+ "thread_limit");
}
/// Get the thread id of the OMP master thread.
CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}
+bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
+ return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+}
+
+static CGOpenMPRuntimeNVPTX::ExecutionMode
+getExecutionModeForDirective(CodeGenModule &CGM,
+ const OMPExecutableDirective &D) {
+ OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+ switch (DirectiveKind) {
+ case OMPD_target:
+ return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+ case OMPD_target_parallel:
+ return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+ default:
+ llvm_unreachable("Unsupported directive on NVPTX device.");
+ }
+ llvm_unreachable("Unsupported directive on NVPTX device.");
+}
+
void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
StringRef ParentName,
llvm::Function *&OutlinedFn,
llvm::Constant *&OutlinedFnID,
bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen) {
+ ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+ CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
EntryFunctionState EST;
WorkerFunctionState WST(CGM);
Work.clear();
EST.ExitBB = nullptr;
}
+void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
+ StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID,
+ bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen) {
+ ExecutionModeRAII ModeRAII(CurrentExecutionMode,
+ CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
+ EntryFunctionState EST;
+
+ // Emit target region as a standalone region.
+ class NVPTXPrePostActionTy : public PrePostActionTy {
+ CGOpenMPRuntimeNVPTX &RT;
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
+ const OMPExecutableDirective &D;
+
+ public:
+ NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
+ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
+ const OMPExecutableDirective &D)
+ : RT(RT), EST(EST), D(D) {}
+ void Enter(CodeGenFunction &CGF) override {
+ RT.emitSpmdEntryHeader(CGF, EST, D);
+ }
+ void Exit(CodeGenFunction &CGF) override {
+ RT.emitSpmdEntryFooter(CGF, EST);
+ }
+ } Action(*this, EST, D);
+ CodeGen.setAction(Action);
+ emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+ IsOffloadEntry, CodeGen);
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
+ CodeGenFunction &CGF, EntryFunctionState &EST,
+ const OMPExecutableDirective &D) {
+ auto &Bld = CGF.Builder;
+
+ // Setup BBs in entry function.
+ llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+ EST.ExitBB = CGF.createBasicBlock(".exit");
+
+ // Initialize the OMP state in the runtime; called by all active threads.
+ // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
+ // based on code analysis of the target region.
+ llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
+ /*RequiresOMPRuntime=*/Bld.getInt16(1),
+ /*RequiresDataSharing=*/Bld.getInt16(1)};
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
+ CGF.EmitBranch(ExecuteBB);
+
+ CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
+ EntryFunctionState &EST) {
+ if (!EST.ExitBB)
+ EST.ExitBB = CGF.createBasicBlock(".exit");
+
+ llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+ CGF.EmitBranch(OMPDeInitBB);
+
+ CGF.EmitBlock(OMPDeInitBB);
+ // Deinitialize the OMP state in the runtime; called by all active threads.
+ CGF.EmitRuntimeCall(
+ createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
+ CGF.EmitBranch(EST.ExitBB);
+
+ CGF.EmitBlock(EST.ExitBB);
+ EST.ExitBB = nullptr;
+}
+
+// Create a unique global variable to indicate the execution mode of this
+// target region. The execution mode is either 'generic' or 'spmd', depending
+// on the target directive. This variable is picked up by the offload library
+// to set up the device appropriately before kernel launch. If the execution
+// mode is 'generic', the runtime reserves one warp for the master; otherwise,
+// all warps participate in parallel work.
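+//
+// For illustration (the mangled kernel name below is only a placeholder), a
+// target region outlined as @__omp_offloading_<...>_l99 in 'generic' mode
+// gets a companion global of the form
+//   @__omp_offloading_<...>_l99_exec_mode = weak constant i8 1
+// as checked by the codegen tests in this patch.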
+static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
+ CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
+ (void)new llvm::GlobalVariable(
+ CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
+ llvm::GlobalValue::WeakAnyLinkage,
+ llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
+}
+
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
auto &Ctx = CGM.getContext();
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
break;
}
+ case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
+ // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
+ // short RequiresOMPRuntime, short RequiresDataSharing);
+ llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+ llvm::FunctionType *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
+ break;
+ }
+ case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
+ // Build void __kmpc_spmd_kernel_deinit();
+ llvm::FunctionType *FnTy =
+ llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
+ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
+ break;
+ }
case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
/// Build void __kmpc_kernel_prepare_parallel(
/// void *outlined_function);
assert(!ParentName.empty() && "Invalid target region parent name!");
- emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
- CodeGen);
+ CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
+ getExecutionModeForDirective(CGM, D);
+ switch (Mode) {
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
+ emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+ break;
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
+ emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
+ CodeGen);
+ break;
+ case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
+ llvm_unreachable(
+ "Unknown programming model for OpenMP directive on NVPTX target.");
+ }
+
+ setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
- : CGOpenMPRuntime(CGM) {
+ : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
if (!CGM.getLangOpts().OpenMPIsDevice)
llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
if (!CGF.HaveInsertPoint())
return;
- emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+ if (isInSpmdExecutionMode())
+ emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+ else
+ emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
ThenRCG(CGF);
}
}
+
+void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
+ CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+ ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+ // Just call the outlined function to execute the parallel region.
+ // OutlinedFn(&GTid, &zero, CapturedStruct);
+ //
+ // TODO: Do something with IfCond when support for the 'if' clause
+ // is added on Spmd target directives.
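+ //
+ // For reference, the SPMD codegen test added in this patch checks that the
+ // resulting IR is a direct call of the form
+ //   call void @<outlined_parallel_fn>(i32* null, i32* null, <captures>...)
+ // where the two null pointers stand in for the global and bound thread ids.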
+ llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+ OutlinedFnArgs.push_back(
+ llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+ OutlinedFnArgs.push_back(
+ llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+ OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+ CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
+}
void createWorkerFunction(CodeGenModule &CGM);
};
+ bool isInSpmdExecutionMode() const;
+
/// \brief Emit the worker function for the current target region.
void emitWorkerFunction(WorkerFunctionState &WST);
/// function.
void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+ /// \brief Helper for Spmd mode target directive's entry function.
+ void emitSpmdEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+ const OMPExecutableDirective &D);
+
+ /// \brief Signal termination of Spmd mode execution.
+ void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+
/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
const RegionCodeGenTy &CodeGen);
+ /// \brief Emit outlined function specialized for the Single Program
+ /// Multiple Data programming model for applicable target directives on the
+ /// NVPTX device.
+ /// \param D Directive to emit.
+ /// \param ParentName Name of the function that encloses the target region.
+ /// \param OutlinedFn Outlined function value to be defined by this call.
+ /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+ /// \param IsOffloadEntry True if the outlined function is an offload entry.
+ /// \param CodeGen Object containing the target statements.
+ /// An outlined function may not be an entry if, e.g., the 'if' clause always
+ /// evaluates to false.
+ void emitSpmdKernel(const OMPExecutableDirective &D, StringRef ParentName,
+ llvm::Function *&OutlinedFn,
+ llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+ const RegionCodeGenTy &CodeGen);
+
/// \brief Emit outlined function for 'target' directive on the NVPTX
/// device.
/// \param D Directive to emit.
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond);
+ /// \brief Emits code for a parallel or serial call of the \a OutlinedFn with
+ /// variables captured in a record whose address is stored in \a
+ /// CapturedVars.
+ /// This call is for a parallel directive within an SPMD target directive.
+ /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+ /// this function is void(*)(kmp_int32 *, kmp_int32 *, struct context_vars*).
+ /// \param CapturedVars A pointer to the record with the references to
+ /// variables used in the \a OutlinedFn function.
+ /// \param IfCond Condition in the associated 'if' clause, if it was
+ /// specified, nullptr otherwise.
+ ///
+ void emitSpmdParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+ llvm::Value *OutlinedFn,
+ ArrayRef<llvm::Value *> CapturedVars,
+ const Expr *IfCond);
+
protected:
/// \brief Get the function name of an outlined region.
// The name can be customized depending on the target.
llvm::Value *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
const Expr *IfCond) override;
+
+public:
+ /// Target codegen is specialized based on two programming models: the
+ /// 'generic' fork-join model of OpenMP, and a more GPU-efficient 'spmd'
+ /// model for constructs like 'target parallel' that support it.
+ enum ExecutionMode {
+ /// Single Program Multiple Data.
+ Spmd,
+ /// Generic codegen to support fork-join model.
+ Generic,
+ Unknown,
+ };
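+
+ // Note: the enumerator value is written verbatim into the companion
+ // <kernel>_exec_mode global (i8 0 for Spmd, i8 1 for Generic), which the
+ // offload library reads to configure the device before kernel launch.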
+
+private:
+ // Track the execution mode when generating code for directives within a
+ // target region. The appropriate mode (generic/spmd) is set on entry to the
+ // target region and used by directives nested within it, such as 'parallel',
+ // to emit optimized code.
+ ExecutionMode CurrentExecutionMode;
};
} // CodeGen namespace.
#ifndef HEADER
#define HEADER
+// Check that the execution mode of all 6 target regions is set to Generic Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l98}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l175}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l284}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l321}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l339}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l304}}_exec_mode = weak constant i8 1
+
template<typename tx, typename ty>
struct TT{
tx X;
double cn[5][n];
TT<long long, char> d;
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l98}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]()
+ // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l98]]()
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
{
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l175}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
+ // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l175]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
// CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
// CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
// CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
aa += 1;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l284}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l284]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]*
return a;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+321}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l321]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l339}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]](
+ // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l339]](
// Create local storage for each capture.
// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
// CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]]
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l304}}_worker()
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
// CHECK: [[EXIT]]
// CHECK: ret void
- // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]]
+ // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l304]](i[[SZ]]
// Create local storage for each capture.
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]]
// CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]]
--- /dev/null
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of both target regions on the GPU is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+ tx a = 0;
+ short aa = 0;
+ tx b[10];
+
+ #pragma omp target parallel if(target: 0)
+ {
+ a += 1;
+ }
+
+ #pragma omp target parallel map(tofrom: aa)
+ {
+ aa += 1;
+ }
+
+ #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
+ {
+ a += 1;
+ aa += 1;
+ b[2] += 1;
+ }
+
+ return a;
+}
+
+int bar(int n){
+ int a = 0;
+
+ a += ftemplate<int>(n);
+
+ return a;
+}
+
+ // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}
+
+
+
+
+
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+ // CHECK: br label {{%?}}[[EXEC:.+]]
+ //
+ // CHECK: [[EXEC]]
+ // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
+ // CHECK: br label {{%?}}[[DONE:.+]]
+ //
+ // CHECK: [[DONE]]
+ // CHECK: call void @__kmpc_spmd_kernel_deinit()
+ // CHECK: br label {{%?}}[[EXIT:.+]]
+ //
+ // CHECK: [[EXIT]]
+ // CHECK: ret void
+ // CHECK: }
+
+ // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
+ // CHECK: = alloca i32*, align
+ // CHECK: = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
+ // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+ // CHECK: ret void
+ // CHECK: }
+
+
+
+
+
+
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(
+ // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+ // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
+ // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+ // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+ // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+ // CHECK: br label {{%?}}[[EXEC:.+]]
+ //
+ // CHECK: [[EXEC]]
+ // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
+ // CHECK: br label {{%?}}[[DONE:.+]]
+ //
+ // CHECK: [[DONE]]
+ // CHECK: call void @__kmpc_spmd_kernel_deinit()
+ // CHECK: br label {{%?}}[[EXIT:.+]]
+ //
+ // CHECK: [[EXIT]]
+ // CHECK: ret void
+ // CHECK: }
+
+ // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
+ // CHECK: = alloca i32*, align
+ // CHECK: = alloca i32*, align
+ // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+ // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+ // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+ // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
+ // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
+ // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
+ // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+ // CHECK: store i32 {{%.+}}, i32* [[A]], align
+ // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+ // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
+ // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
+ // CHECK: ret void
+ // CHECK: }
+#endif