Merging r308455:

[clang] / lib / CodeGen / CGOpenMPRuntimeNVPTX.cpp
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

index 810e0e013e83e649b7616b853a153849d0ec235b..3ced05d08a47c995df79709f2af498fd9661f796 100644 (file)
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -56,6 +56,16 @@ enum OpenMPRTLFunctionNVPTX {
    /// lane_offset, int16_t shortCircuit),
    /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
+  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
+  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
+  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
+  /// int32_t index, int32_t width),
+  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
+  /// index, int32_t width, int32_t reduce))
+  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
    /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
    OMPRTL_NVPTX__kmpc_end_reduce_nowait
  };
@@ -125,6 +135,9 @@ enum MachineConfiguration : unsigned {
    /// computed as log_2(WarpSize).
    LaneIDBits = 5,
    LaneIDMask = WarpSize - 1,
+
+  /// Global memory alignment for performance.
+  GlobalMemoryAlignment = 256,
  };
  
  enum NamedBarrier : unsigned {
@@ -694,6 +707,49 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
          FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
      break;
    }
+  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
+    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
+    // int32_t num_vars, size_t reduce_size, void *reduce_data,
+    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+    // lane_offset, int16_t shortCircuit),
+    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
+    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
+    // int32_t index, int32_t width),
+    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
+    // int32_t index, int32_t width, int32_t reduce))
+    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
+                                             CGM.Int16Ty, CGM.Int16Ty};
+    auto *ShuffleReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
+    auto *InterWarpCopyFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
+                                                CGM.Int32Ty, CGM.Int32Ty};
+    auto *CopyToScratchpadFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *LoadReduceTypeParams[] = {
+        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
+    auto *LoadReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.SizeTy,
+                                CGM.VoidPtrTy,
+                                ShuffleReduceFnTy->getPointerTo(),
+                                InterWarpCopyFnTy->getPointerTo(),
+                                CopyToScratchpadFnTy->getPointerTo(),
+                                LoadReduceFnTy->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
+    break;
+  }
    case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
      // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
      llvm::Type *TypeParams[] = {CGM.Int32Ty};
@@ -805,6 +861,7 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
        D, ThreadIDVar, InnermostKind, CodeGen);
    llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
+  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
  
    return OutlinedFun;
@@ -966,24 +1023,39 @@ enum CopyAction : unsigned {
    RemoteLaneToThread,
    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
    ThreadCopy,
+  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
+  ThreadToScratchpad,
+  // ScratchpadToThread: Copy from a scratchpad array in global memory
+  // containing team-reduced data to a thread's stack.
+  ScratchpadToThread,
  };
  } // namespace
  
+struct CopyOptionsTy {
+  llvm::Value *RemoteLaneOffset;
+  llvm::Value *ScratchpadIndex;
+  llvm::Value *ScratchpadWidth;
+};
+
  /// Emit instructions to copy a Reduce list, which contains partially
  /// aggregated values, in the specified direction.
-static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
-                                  QualType ReductionArrayTy,
-                                  ArrayRef<const Expr *> Privates,
-                                  Address SrcBase, Address DestBase,
-                                  llvm::Value *RemoteLaneOffset = nullptr) {
+static void emitReductionListCopy(
+    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
+    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
+    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
  
    auto &CGM = CGF.CGM;
    auto &C = CGM.getContext();
    auto &Bld = CGF.Builder;
  
+  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
+  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
+
    // Iterates, element-by-element, through the source Reduce list and
    // make a copy.
    unsigned Idx = 0;
+  unsigned Size = Privates.size();
    for (auto &Private : Privates) {
      Address SrcElementAddr = Address::invalid();
      Address DestElementAddr = Address::invalid();
@@ -993,6 +1065,10 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
      // Set to true to update the pointer in the dest Reduce list to a
      // newly created element.
      bool UpdateDestListPtr = false;
+    // Increment the src or dest pointer to the scratchpad, for each
+    // new element.
+    bool IncrScratchpadSrc = false;
+    bool IncrScratchpadDest = false;
  
      switch (Action) {
      case RemoteLaneToThread: {
@@ -1036,6 +1112,59 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
            DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
        break;
      }
+    case ThreadToScratchpad: {
+      // Step 1.1: Get the address for the src element in the Reduce list.
+      Address SrcElementPtrAddr =
+          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
+          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      SrcElementAddr =
+          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+
+      // Step 1.2: Get the address for dest element:
+      // address = base + index * ElementSizeInChars.
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      auto *CurrentOffset =
+          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
+                        ScratchpadIndex);
+      auto *ScratchPadElemAbsolutePtrVal =
+          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
+      ScratchPadElemAbsolutePtrVal =
+          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
+      Address ScratchpadPtr =
+          Address(ScratchPadElemAbsolutePtrVal,
+                  C.getTypeAlignInChars(Private->getType()));
+      DestElementAddr = Bld.CreateElementBitCast(
+          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
+      IncrScratchpadDest = true;
+      break;
+    }
+    case ScratchpadToThread: {
+      // Step 1.1: Get the address for the src element in the scratchpad.
+      // address = base + index * ElementSizeInChars.
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      auto *CurrentOffset =
+          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
+                        ScratchpadIndex);
+      auto *ScratchPadElemAbsolutePtrVal =
+          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
+      ScratchPadElemAbsolutePtrVal =
+          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
+      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
+                               C.getTypeAlignInChars(Private->getType()));
+      IncrScratchpadSrc = true;
+
+      // Step 1.2: Create a temporary to store the element in the destination
+      // Reduce list.
+      DestElementPtrAddr =
+          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+      DestElementAddr =
+          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
+      UpdateDestListPtr = true;
+      break;
+    }
      }
  
      // Regardless of src and dest of copy, we emit the load of src
@@ -1069,10 +1198,253 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
                              C.VoidPtrTy);
      }
  
+    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
+    // address of the next element in scratchpad memory, unless we're currently
+    // processing the last one.  Memory alignment is also taken care of here.
+    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
+      llvm::Value *ScratchpadBasePtr =
+          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      ScratchpadBasePtr = Bld.CreateAdd(
+          ScratchpadBasePtr,
+          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
+                                             CGM.SizeTy, ElementSizeInChars)));
+
+      // Take care of global memory alignment for performance
+      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
+                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
+      ScratchpadBasePtr = Bld.CreateSDiv(
+          ScratchpadBasePtr,
+          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
+      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
+                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
+      ScratchpadBasePtr = Bld.CreateMul(
+          ScratchpadBasePtr,
+          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
+
+      if (IncrScratchpadDest)
+        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
+      else /* IncrScratchpadSrc = true */
+        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
+    }
+
      Idx++;
    }
  }
  
+/// This function emits a helper that loads data from the scratchpad array
+/// and (optionally) reduces it with the input operand.
+///
+///  load_and_reduce(local, scratchpad, index, width, should_reduce)
+///  reduce_data remote;
+///  for elem in remote:
+///    remote.elem = Scratchpad[elem_id][index]
+///  if (should_reduce)
+///    local = local @ remote
+///  else
+///    local = remote
+static llvm::Value *
+emitReduceScratchpadFunction(CodeGenModule &CGM,
+                             ArrayRef<const Expr *> Privates,
+                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
+  auto &C = CGM.getContext();
+  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
+
+  // Destination of the copy.
+  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Base address of the scratchpad array, with each element storing a
+  // Reduce list per team.
+  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  // A source index into the scratchpad array.
+  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
+  // Row width of an element in the scratchpad array, typically
+  // the number of teams.
+  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
+  // If should_reduce == 1, then it's load AND reduce,
+  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
+  // The latter case is used for initialization.
+  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);
+
+  FunctionArgList Args;
+  Args.push_back(&ReduceListArg);
+  Args.push_back(&ScratchPadArg);
+  Args.push_back(&IndexArg);
+  Args.push_back(&WidthArg);
+  Args.push_back(&ShouldReduceArg);
+
+  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_load_and_reduce", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
+  CodeGenFunction CGF(CGM);
+  // We don't need debug information in this function as nothing here refers to
+  // user code.
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+
+  auto &Bld = CGF.Builder;
+
+  // Get local Reduce list pointer.
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address ReduceListAddr(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, SourceLocation()),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+
+  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
+  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+
+  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
+  llvm::Value *IndexVal =
+      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
+                                             Int32Ty, SourceLocation()),
+                        CGM.SizeTy, /*isSigned=*/true);
+
+  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
+  llvm::Value *WidthVal =
+      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
+                                             Int32Ty, SourceLocation()),
+                        CGM.SizeTy, /*isSigned=*/true);
+
+  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
+  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
+      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());
+
+  // The absolute ptr address to the base addr of the next element to copy.
+  llvm::Value *CumulativeElemBasePtr =
+      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
+  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
+
+  // Create a Remote Reduce list to store the elements read from the
+  // scratchpad array.
+  Address RemoteReduceList =
+      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
+
+  // Assemble remote Reduce list from scratchpad array.
+  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
+                        SrcDataAddr, RemoteReduceList,
+                        {/*RemoteLaneOffset=*/nullptr,
+                         /*ScratchpadIndex=*/IndexVal,
+                         /*ScratchpadWidth=*/WidthVal});
+
+  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
+  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
+  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
+
+  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
+  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
+
+  CGF.EmitBlock(ThenBB);
+  // We should reduce with the local Reduce list.
+  // reduce_function(LocalReduceList, RemoteReduceList)
+  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
+  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
+  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
+  Bld.CreateBr(MergeBB);
+
+  CGF.EmitBlock(ElseBB);
+  // No reduction; just copy:
+  // Local Reduce list = Remote Reduce list.
+  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
+                        RemoteReduceList, ReduceListAddr);
+  Bld.CreateBr(MergeBB);
+
+  CGF.EmitBlock(MergeBB);
+
+  CGF.FinishFunction();
+  return Fn;
+}
+
+/// This function emits a helper that stores reduced data from the team
+/// master to a scratchpad array in global memory.
+///
+///  for elem in Reduce List:
+///    scratchpad[elem_id][index] = elem
+///
+static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
+                                         ArrayRef<const Expr *> Privates,
+                                         QualType ReductionArrayTy) {
+
+  auto &C = CGM.getContext();
+  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
+
+  // Source of the copy.
+  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  // Base address of the scratchpad array, with each element storing a
+  // Reduce list per team.
+  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
+  // A destination index into the scratchpad array, typically the team
+  // identifier.
+  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
+  // Row width of an element in the scratchpad array, typically
+  // the number of teams.
+  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
+
+  FunctionArgList Args;
+  Args.push_back(&ReduceListArg);
+  Args.push_back(&ScratchPadArg);
+  Args.push_back(&IndexArg);
+  Args.push_back(&WidthArg);
+
+  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
+  CodeGenFunction CGF(CGM);
+  // We don't need debug information in this function as nothing here refers to
+  // user code.
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+
+  auto &Bld = CGF.Builder;
+
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address SrcDataAddr(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, SourceLocation()),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+
+  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
+  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+
+  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
+  llvm::Value *IndexVal =
+      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
+                                             Int32Ty, SourceLocation()),
+                        CGF.SizeTy, /*isSigned=*/true);
+
+  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
+  llvm::Value *WidthVal =
+      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
+                                             Int32Ty, SourceLocation()),
+                        CGF.SizeTy, /*isSigned=*/true);
+
+  // The absolute ptr address to the base addr of the next element to copy.
+  llvm::Value *CumulativeElemBasePtr =
+      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
+  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
+
+  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
+                        SrcDataAddr, DestDataAddr,
+                        {/*RemoteLaneOffset=*/nullptr,
+                         /*ScratchpadIndex=*/IndexVal,
+                         /*ScratchpadWidth=*/WidthVal});
+
+  CGF.FinishFunction();
+  return Fn;
+}
+
  /// This function emits a helper that gathers Reduce lists from the first
  /// lane of every active warp to lanes in the first warp.
  ///
@@ -1094,13 +1466,12 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
    // ReduceList: thread local Reduce list.
    // At the stage of the computation when this function is called, partially
    // aggregated values reside in the first lane of every active warp.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
-                                  /*Id=*/nullptr, C.VoidPtrTy);
+  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
    // NumWarps: number of warps active in the parallel region.  This could
    // be smaller than 32 (max warps in a CTA) for partial block reduction.
-  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(),
-                                /*Id=*/nullptr,
-                                C.getIntTypeForBitwidth(32, /* Signed */ true));
+  ImplicitParamDecl NumWarpsArg(C,
+                                C.getIntTypeForBitwidth(32, /* Signed */ true),
+                                ImplicitParamDecl::Other);
    FunctionArgList Args;
    Args.push_back(&ReduceListArg);
    Args.push_back(&NumWarpsArg);
@@ -1342,17 +1713,14 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
    auto &C = CGM.getContext();
  
    // Thread local Reduce list used to host the values of data to be reduced.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
-                                  /*Id=*/nullptr, C.VoidPtrTy);
+  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
    // Current lane id; could be logical.
-  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(),
-                              /*Id=*/nullptr, C.ShortTy);
+  ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
    // Offset of the remote source lane relative to the current lane.
-  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(),
-                                        /*Id=*/nullptr, C.ShortTy);
+  ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
+                                        ImplicitParamDecl::Other);
    // Algorithm version.  This is expected to be known at compile time.
-  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(),
-                               /*Id=*/nullptr, C.ShortTy);
+  ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
    FunctionArgList Args;
    Args.push_back(&ReduceListArg);
    Args.push_back(&LaneIDArg);
@@ -1402,7 +1770,9 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
    // hosted on the thread's stack.
    emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                          LocalReduceList, RemoteReduceList,
-                        RemoteLaneOffsetArgVal);
+                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
+                         /*ScratchpadIndex=*/nullptr,
+                         /*ScratchpadWidth=*/nullptr});
  
    // The actions to be performed on the Remote Reduce list is dependent
    // on the algorithm version.
@@ -1575,6 +1945,23 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
  ///     reduced variables across warps.  It tunnels, through CUDA
  ///     shared memory, the thread-private data of type 'ReduceData'
  ///     from lane 0 of each warp to a lane in the first warp.
+/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
+///    The last team writes the global reduced value to memory.
+///
+///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
+///             reduceData, shuffleReduceFn, interWarpCpyFn,
+///             scratchpadCopyFn, loadAndReduceFn)
+///
+///     'scratchpadCopyFn' is a helper that stores reduced
+///     data from the team master to a scratchpad array in
+///     global memory.
+///
+///     'loadAndReduceFn' is a helper that loads data from
+///     the scratchpad array and reduces it with the input
+///     operand.
+///
+///     These compiler generated functions hide address
+///     calculation and alignment information from the runtime.
  /// 5. if ret == 1:
  ///     The team master of the last team stores the reduced
  ///     result to the globals in memory.
@@ -1696,6 +2083,21 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM,
  /// a mathematical sense) the problem of reduction across warp masters in
  /// a block to the problem of warp reduction.
  ///
+///
+/// Inter-Team Reduction
+///
+/// Once a team has reduced its data to a single value, it is stored in
+/// a global scratchpad array.  Since each team has a distinct slot, this
+/// can be done without locking.
+///
+/// The last team to write to the scratchpad array proceeds to reduce the
+/// scratchpad array.  One or more workers in the last team use the helper
+/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
+/// the k'th worker reduces every k'th element.
+///
+/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
+/// reduce across workers and compute a globally reduced value.
+///
  void CGOpenMPRuntimeNVPTX::emitReduction(
      CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
      ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
@@ -1704,7 +2106,10 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
      return;
  
    bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
-  assert(ParallelReduction && "Invalid reduction selection in emitReduction.");
+  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
+  // FIXME: Add support for simd reduction.
+  assert((TeamsReduction || ParallelReduction) &&
+         "Invalid reduction selection in emitReduction.");
  
    auto &C = CGM.getContext();
  
@@ -1777,6 +2182,25 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
          Args);
    }
  
+  if (TeamsReduction) {
+    auto *ScratchPadCopyFn =
+        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
+    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
+        CGM, Privates, ReductionArrayTy, ReductionFn);
+
+    llvm::Value *Args[] = {ThreadId,
+                           CGF.Builder.getInt32(RHSExprs.size()),
+                           ReductionArrayTySize,
+                           RL,
+                           ShuffleAndReduceFn,
+                           InterWarpCopyFn,
+                           ScratchPadCopyFn,
+                           LoadAndReduceFn};
+    Res = CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
+        Args);
+  }
+
    // 5. Build switch(res)
    auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
    auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);