[OPENMP][NVPTX] Emit correct reduction code for teams/parallel reductions.
author    Alexey Bataev <a.bataev@hotmail.com>  Fri, 16 Nov 2018 19:38:21 +0000 (19:38 +0000)
committer Alexey Bataev <a.bataev@hotmail.com>  Fri, 16 Nov 2018 19:38:21 +0000 (19:38 +0000)

Fixed previously committed code for the reduction support in
teams/parallel constructs, taking into account the new design of the
NVPTX support in the compiler. Teams reductions are not fully functional
yet; they will be fixed in the following patches.
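
For reference, the shape of the constructs this patch targets looks like the
snippet below (illustrative only, mirroring the new test case in
test/OpenMP/nvptx_teams_reduction_codegen.cpp; the declarations of a and b
are assumed):

  int a = 0;
  short b = 0;
  #pragma omp target
  #pragma omp teams reduction(|: a) reduction(max: b)
  #pragma omp parallel reduction(|: a) reduction(max: b)
  {
    a |= 1;
    b = 99 > b ? 99 : b;
  }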

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@347081 91177308-0d34-0410-b5e6-96231b3b80d8

lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
lib/CodeGen/CGOpenMPRuntimeNVPTX.h
test/OpenMP/nvptx_data_sharing.cpp
test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
test/OpenMP/nvptx_parallel_codegen.cpp
test/OpenMP/nvptx_parallel_for_codegen.cpp
test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
test/OpenMP/nvptx_teams_codegen.cpp
test/OpenMP/nvptx_teams_reduction_codegen.cpp
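
The central codegen change is in emitInterWarpCopyFunction: the shared-memory
transfer medium shrinks from [32 x i64] to [32 x i32], and each reduction
element is now copied through it in 4-, 2-, and 1-byte chunks using volatile
accesses. A hand-written C++ sketch of the loop structure the compiler now
emits (an approximation for orientation only, not the actual IR; names such
as lane_id, medium, and named_barrier_sync are placeholders):

  // Copy one reduction element of RealTySize bytes through the shared
  // medium, widest chunks first; any remainder falls through to the
  // narrower chunk sizes on the next iteration.
  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
    unsigned NumIters = RealTySize / TySize; // full chunks of this width
    for (unsigned Cnt = 0; Cnt < NumIters; ++Cnt) {
      if (lane_id == 0)                      // warp master publishes chunk
        medium[warp_id] = src_chunks[Cnt];   // volatile store to __shared__
      named_barrier_sync(num_warps * warp_size);
      if (thread_id < num_warps)             // warp 0 gathers all chunks
        dst_chunks[Cnt] = medium[thread_id]; // volatile load from __shared__
      named_barrier_sync(num_warps * warp_size);
    }
    RealTySize %= TySize;
  }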

diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 8a9845489273eff44621b5f639d83a50034b1cb8..da1c0d2a3a9d0177af50e729ca69ef683dd1e635 100644
@@ -188,6 +188,28 @@ enum NamedBarrier : unsigned {
   NB_Parallel = 1,
 };
 
+static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
+  RefExpr = RefExpr->IgnoreParens();
+  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
+    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    RefExpr = Base;
+  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
+    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+      Base = TempOASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    RefExpr = Base;
+  }
+  RefExpr = RefExpr->IgnoreParenImpCasts();
+  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
+    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+  const auto *ME = cast<MemberExpr>(RefExpr);
+  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
 typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
 static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
   return P1.first > P2.first;
@@ -394,7 +416,10 @@ class CheckVarsEscapingDeclContext final
   }
 
 public:
-  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
+  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
+                               ArrayRef<const ValueDecl *> TeamsReductions)
+      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
+  }
   virtual ~CheckVarsEscapingDeclContext() = default;
   void VisitDeclStmt(const DeclStmt *S) {
     if (!S)
@@ -614,8 +639,10 @@ static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
 
 /// Get barrier to synchronize all threads in a block.
 static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
-  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
-      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
+  llvm::Function *F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
+  F->addFnAttr(llvm::Attribute::Convergent);
+  CGF.EmitRuntimeCall(F);
 }
 
 /// Get barrier #ID to synchronize selected (multiple of warp size) threads in
@@ -624,9 +651,10 @@ static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                             llvm::Value *NumThreads) {
   CGBuilderTy &Bld = CGF.Builder;
   llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
-  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
-                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
-                      Args);
+  llvm::Function *F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier);
+  F->addFnAttr(llvm::Attribute::Convergent);
+  CGF.EmitRuntimeCall(F, Args);
 }
 
 /// Synchronize all GPU threads in a block.
@@ -1965,10 +1993,20 @@ getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
   if (!Dir)
     return;
   for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
-    for (const Expr *E : C->getVarRefs()) {
-      const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
-      Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
-    }
+    for (const Expr *E : C->getVarRefs())
+      Vars.push_back(getPrivateItem(E));
+  }
+}
+
+/// Get list of reduction variables from the teams ... directives.
+static void
+getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
+                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
+  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
+         "expected teams directive.");
+  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+    for (const Expr *E : C->privates())
+      Vars.push_back(getPrivateItem(E));
   }
 }
 
@@ -1978,13 +2016,22 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
   SourceLocation Loc = D.getBeginLoc();
 
   const RecordDecl *GlobalizedRD = nullptr;
-  llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
+  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
+  // Globalize teams reduction variables unconditionally in all modes.
+  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
-    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates);
-    if (!LastPrivates.empty())
+    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
+    if (!LastPrivatesReductions.empty()) {
       GlobalizedRD = ::buildRecordForGlobalizedVars(
-          CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
+          CGM.getContext(), llvm::None, LastPrivatesReductions,
+          MappedDeclsFields);
+    }
+  } else if (!LastPrivatesReductions.empty()) {
+    assert(!TeamAndReductions.first &&
+           "Previous team declaration is not expected.");
+    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
+    std::swap(TeamAndReductions.second, LastPrivatesReductions);
   }
 
   // Emit target region as a standalone region.
@@ -2162,7 +2209,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
             /*Volatile=*/false, Int16Ty, Loc);
         auto *StaticGlobalized = new llvm::GlobalVariable(
             CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
-            llvm::GlobalValue::WeakAnyLinkage, nullptr);
+            llvm::GlobalValue::CommonLinkage, nullptr);
         auto *RecSize = new llvm::GlobalVariable(
             CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
             llvm::GlobalValue::InternalLinkage, nullptr,
@@ -2801,11 +2848,12 @@ static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
           CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
           IntType, Offset, Loc);
       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
-      Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
-      ElemPtr =
+      Address LocalPtr =
+          Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
+      Address LocalElemPtr =
           Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
-      PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
-      PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
+      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
+      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
       CGF.EmitBranch(PreCondBB);
       CGF.EmitBlock(ExitBB);
     } else {
@@ -3228,10 +3276,9 @@ static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
       CGF.SizeTy, /*isSigned=*/true);
 
   Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
-  llvm::Value *WidthVal =
-      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
-                                             Int32Ty, SourceLocation()),
-                        CGF.SizeTy, /*isSigned=*/true);
+  llvm::Value *WidthVal = Bld.CreateIntCast(
+      CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
+      CGF.SizeTy, /*isSigned=*/true);
 
   // The absolute ptr address to the base addr of the next element to copy.
   llvm::Value *CumulativeElemBasePtr =
@@ -3305,11 +3352,10 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   llvm::GlobalVariable *TransferMedium =
       M.getGlobalVariable(TransferMediumName);
   if (!TransferMedium) {
-    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
+    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
     unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
     TransferMedium = new llvm::GlobalVariable(
-        M, Ty,
-        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
+        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
         llvm::Constant::getNullValue(Ty), TransferMediumName,
         /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
         SharedAddressSpace);
@@ -3327,7 +3373,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   Address LocalReduceList(
       Bld.CreatePointerBitCastOrAddrSpaceCast(
           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, SourceLocation()),
+                               C.VoidPtrTy, Loc),
           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
       CGF.getPointerAlign());
 
@@ -3337,121 +3383,150 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
     // Warp master copies reduce element to transfer medium in __shared__
     // memory.
     //
-    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
-    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
-    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
-    // if (lane_id == 0)
-    llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
-    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
-    CGF.EmitBlock(ThenBB);
-
-    // Reduce element = LocalReduceList[i]
-    Address ElemPtrPtrAddr =
-        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
-    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
-        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-    // elemptr = (type[i]*)(elemptrptr)
-    Address ElemPtr =
-        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
-    ElemPtr = Bld.CreateElementBitCast(
-        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
-    // Get pointer to location in transfer medium.
-    // MediumPtr = &medium[warp_id]
-    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
-        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
-    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
-    // Casting to actual data type.
-    // MediumPtr = (type[i]*)MediumPtrAddr;
-    MediumPtr = Bld.CreateElementBitCast(
-        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
-    // elem = *elemptr
-    //*MediumPtr = elem
-    if (Private->getType()->isScalarType()) {
-      llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
-                                               Private->getType(), Loc);
-      // Store the source element value to the dest element address.
-      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
-                            Private->getType());
-    } else {
-      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
-                            CGF.MakeAddrLValue(MediumPtr, Private->getType()),
-                            Private->getType(), AggValueSlot::DoesNotOverlap);
-    }
-
-    Bld.CreateBr(MergeBB);
-
-    CGF.EmitBlock(ElseBB);
-    Bld.CreateBr(MergeBB);
+    unsigned RealTySize =
+        C.getTypeSizeInChars(Private->getType())
+            .alignTo(C.getTypeAlignInChars(Private->getType()))
+            .getQuantity();
+    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+      unsigned NumIters = RealTySize / TySize;
+      if (NumIters == 0)
+        continue;
+      QualType CType = C.getIntTypeForBitwidth(
+          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
+      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
+      CharUnits Align = CharUnits::fromQuantity(TySize);
+      llvm::Value *Cnt = nullptr;
+      Address CntAddr = Address::invalid();
+      llvm::BasicBlock *PrecondBB = nullptr;
+      llvm::BasicBlock *ExitBB = nullptr;
+      if (NumIters > 1) {
+        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
+        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
+                              /*Volatile=*/false, C.IntTy);
+        PrecondBB = CGF.createBasicBlock("precond");
+        ExitBB = CGF.createBasicBlock("exit");
+        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
+        // There is no need to emit line number for unconditional branch.
+        (void)ApplyDebugLocation::CreateEmpty(CGF);
+        CGF.EmitBlock(PrecondBB);
+        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
+        llvm::Value *Cmp =
+            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
+        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
+        CGF.EmitBlock(BodyBB);
+      }
+      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
+      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
+      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
 
-    CGF.EmitBlock(MergeBB);
+      // if (lane_id == 0)
+      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
+      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+      CGF.EmitBlock(ThenBB);
 
-    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
-    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
-        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
+      // Reduce element = LocalReduceList[i]
+      Address ElemPtrPtrAddr =
+          Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
+          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      // elemptr = ((CopyType*)(elemptrptr)) + I
+      Address ElemPtr = Address(ElemPtrPtr, Align);
+      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
+      if (NumIters > 1) {
+        ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
+                          ElemPtr.getAlignment());
+      }
 
-    llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
-        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
-    // named_barrier_sync(ParallelBarrierID, num_active_threads)
-    syncParallelThreads(CGF, NumActiveThreads);
+      // Get pointer to location in transfer medium.
+      // MediumPtr = &medium[warp_id]
+      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
+          TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
+      Address MediumPtr(MediumPtrVal, Align);
+      // Casting to actual data type.
+      // MediumPtr = (CopyType*)MediumPtrAddr;
+      MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
+
+      // elem = *elemptr
+      //*MediumPtr = elem
+      llvm::Value *Elem =
+          CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
+      // Store the source element value to the dest element address.
+      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
+
+      Bld.CreateBr(MergeBB);
+
+      CGF.EmitBlock(ElseBB);
+      Bld.CreateBr(MergeBB);
+
+      CGF.EmitBlock(MergeBB);
+
+      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
+      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
+          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
+
+      llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
+          NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
+      // named_barrier_sync(ParallelBarrierID, num_active_threads)
+      syncParallelThreads(CGF, NumActiveThreads);
+
+      //
+      // Warp 0 copies reduce element from transfer medium.
+      //
+      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
+      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
+      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
+
+      // Up to 32 threads in warp 0 are active.
+      llvm::Value *IsActiveThread =
+          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+      CGF.EmitBlock(W0ThenBB);
+
+      // SrcMediumPtr = &medium[tid]
+      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
+          TransferMedium,
+          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
+      Address SrcMediumPtr(SrcMediumPtrVal, Align);
+      // SrcMediumVal = *SrcMediumPtr;
+      SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
+
+      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+      Address TargetElemPtrPtr =
+          Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
+          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
+      Address TargetElemPtr = Address(TargetElemPtrVal, Align);
+      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
+      if (NumIters > 1) {
+        TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
+                                TargetElemPtr.getAlignment());
+      }
 
-    //
-    // Warp 0 copies reduce element from transfer medium.
-    //
-    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
-    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
-    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
-    // Up to 32 threads in warp 0 are active.
-    llvm::Value *IsActiveThread =
-        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
-    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
-    CGF.EmitBlock(W0ThenBB);
-
-    // SrcMediumPtr = &medium[tid]
-    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
-        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
-    Address SrcMediumPtr(SrcMediumPtrVal,
-                         C.getTypeAlignInChars(Private->getType()));
-    // SrcMediumVal = *SrcMediumPtr;
-    SrcMediumPtr = Bld.CreateElementBitCast(
-        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
-    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
-    Address TargetElemPtrPtr =
-        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
-    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
-        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-    Address TargetElemPtr =
-        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
-    TargetElemPtr = Bld.CreateElementBitCast(
-        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
-    // *TargetElemPtr = SrcMediumVal;
-    if (Private->getType()->isScalarType()) {
-      llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
-          SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
+      // *TargetElemPtr = SrcMediumVal;
+      llvm::Value *SrcMediumValue =
+          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
-                            Private->getType());
-    } else {
-      CGF.EmitAggregateCopy(
-          CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
-          CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
-          Private->getType(), AggValueSlot::DoesNotOverlap);
-    }
-    Bld.CreateBr(W0MergeBB);
+                            CType);
+      Bld.CreateBr(W0MergeBB);
 
-    CGF.EmitBlock(W0ElseBB);
-    Bld.CreateBr(W0MergeBB);
+      CGF.EmitBlock(W0ElseBB);
+      Bld.CreateBr(W0MergeBB);
 
-    CGF.EmitBlock(W0MergeBB);
+      CGF.EmitBlock(W0MergeBB);
 
-    // While warp 0 copies values from transfer medium, all other warps must
-    // wait.
-    syncParallelThreads(CGF, NumActiveThreads);
+      // While warp 0 copies values from transfer medium, all other warps must
+      // wait.
+      syncParallelThreads(CGF, NumActiveThreads);
+      if (NumIters > 1) {
+        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
+        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
+        CGF.EmitBranch(PrecondBB);
+        (void)ApplyDebugLocation::CreateEmpty(CGF);
+        CGF.EmitBlock(ExitBB);
+      }
+      RealTySize %= TySize;
+    }
     ++Idx;
   }
 
@@ -3926,16 +4001,17 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
-  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
-  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
-         "Invalid reduction selection in emitReduction.");
 
   if (Options.SimpleReduction) {
+    assert(!TeamsReduction && !ParallelReduction &&
+           "Invalid reduction selection in emitReduction.");
     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                    ReductionOps, Options);
     return;
   }
 
+  assert((TeamsReduction || ParallelReduction) &&
+         "Invalid reduction selection in emitReduction.");
   ASTContext &C = CGM.getContext();
 
   // 1. Build a list of reduction variables.
@@ -3993,24 +4069,20 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
   llvm::Value *InterWarpCopyFn =
       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
 
-  llvm::Value *Args[] = {ThreadId,
-                         CGF.Builder.getInt32(RHSExprs.size()),
-                         ReductionArrayTySize,
-                         RL,
-                         ShuffleAndReduceFn,
-                         InterWarpCopyFn};
+  llvm::Value *Res;
+  if (ParallelReduction) {
+    llvm::Value *Args[] = {ThreadId,
+                           CGF.Builder.getInt32(RHSExprs.size()),
+                           ReductionArrayTySize,
+                           RL,
+                           ShuffleAndReduceFn,
+                           InterWarpCopyFn};
 
-  llvm::Value *Res = nullptr;
-  if (ParallelReduction)
     Res = CGF.EmitRuntimeCall(
         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
         Args);
-  else if (SimdReduction)
-    Res = CGF.EmitRuntimeCall(
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
-        Args);
-
-  if (TeamsReduction) {
+  } else {
+    assert(TeamsReduction && "expected teams reduction.");
     llvm::Value *ScratchPadCopyFn =
         emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
     llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
@@ -4029,18 +4101,18 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
         Args);
   }
 
-  // 5. Build switch(res)
-  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
-  llvm::SwitchInst *SwInst =
-      CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
+  // 5. Build if (res == 1)
+  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
+  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
+  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
+      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
+  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
 
-  // 6. Build case 1: where we have reduced values in the master
+  // 6. Build then branch: where we have reduced values in the master
   //    thread in each team.
   //    __kmpc_end_reduce{_nowait}(<gtid>);
   //    break;
-  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
-  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
-  CGF.EmitBlock(Case1BB);
+  CGF.EmitBlock(ThenBB);
 
   // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
   llvm::Value *EndArgs[] = {ThreadId};
@@ -4064,8 +4136,9 @@ void CGOpenMPRuntimeNVPTX::emitReduction(
       EndArgs);
   RCG.setAction(Action);
   RCG(CGF);
-  CGF.EmitBranch(DefaultBB);
-  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+  // There is no need to emit line number for unconditional branch.
+  (void)ApplyDebugLocation::CreateEmpty(CGF);
+  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
 }
 
 const VarDecl *
@@ -4292,6 +4365,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   assert(D && "Expected function or captured|block decl.");
   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
          "Function is registered already.");
+  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
+         "Team is set but not processed.");
   const Stmt *Body = nullptr;
   bool NeedToDelayGlobalization = false;
   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
@@ -4307,10 +4382,12 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   }
   if (!Body)
     return;
-  CheckVarsEscapingDeclContext VarChecker(CGF);
+  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
   VarChecker.Visit(Body);
   const RecordDecl *GlobalizedVarsRecord =
       VarChecker.getGlobalizedRecord(IsInTTDRegion);
+  TeamAndReductions.first = nullptr;
+  TeamAndReductions.second.clear();
   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
       VarChecker.getEscapedVariableLengthDecls();
   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4331,7 +4408,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
     Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
   }
   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
-    CheckVarsEscapingDeclContext VarChecker(CGF);
+    CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
     VarChecker.Visit(Body);
     I->getSecond().SecondaryGlobalRecord =
         VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
@@ -4583,7 +4660,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
           llvm::Constant::getNullValue(LLVMStaticTy),
           "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
           llvm::GlobalValue::NotThreadLocal,
@@ -4609,7 +4686,7 @@ void CGOpenMPRuntimeNVPTX::clear() {
       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
       auto *GV = new llvm::GlobalVariable(
           CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+          /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
           llvm::Constant::getNullValue(LLVMArr2Ty),
           "_openmp_static_glob_rd_$_");
       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
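
With the switch in emitReduction replaced by a direct compare (see the hunks
above around "5. Build if (res == 1)"), the reduction epilogue now has
roughly the following shape (a pseudo-C++ paraphrase of the emitted IR;
argument names are placeholders):

  int32_t res = __kmpc_nvptx_parallel_reduce_nowait(
      thread_id, num_vars, reduce_list_size, reduce_list,
      shuffle_and_reduce_fn, inter_warp_copy_fn);
  if (res == 1) {
    // Only the master thread of each team gets here with the reduced
    // values; it folds the partial results back into the original vars.
    __kmpc_end_reduce_nowait(thread_id);
  }
  // All threads continue at .omp.reduction.done.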
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index a157e421e060169380e36bb9f26d7496cf1e8905..aff9cf21135870d82061cb599269900a692b83e9 100644
@@ -431,6 +431,10 @@ private:
   /// Shared pointer for the global memory in the global memory buffer used for
   /// the given kernel.
   llvm::GlobalVariable *KernelStaticGlobalized = nullptr;
+  /// Pair of the Non-SPMD team and all reduction variables in this team
+  /// region.
+  std::pair<const Decl *, llvm::SmallVector<const ValueDecl *, 4>>
+      TeamAndReductions;
 };
 
 } // CodeGen namespace.
diff --git a/test/OpenMP/nvptx_data_sharing.cpp b/test/OpenMP/nvptx_data_sharing.cpp
index 7f822aba921bec97ccdfd072d226b240a53bc37e..ed3c88b577165d7ecdfe774a3e70067ca0f3aa26 100644
@@ -27,7 +27,7 @@ void test_ds(){
   }
 }
 // CK1: [[MEM_TY:%.+]] = type { [8 x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
 // CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 71a3ad549170f7b1a7d24f94086515cee3855f77..a84962c4d815a80370767acfbbbda9a3137fb3aa 100644
@@ -22,7 +22,7 @@ int main(int argc, char **argv) {
 }
 
 // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp
index a3b1b012bf0068f911bff1b1e69739e6695793a8..2fd837c92b2d4d95e16776545ed61ea738a72b29 100644
@@ -72,7 +72,7 @@ int bar(int n){
 }
 
 // CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/test/OpenMP/nvptx_parallel_for_codegen.cpp b/test/OpenMP/nvptx_parallel_for_codegen.cpp
index c292152e65818127ef9a0cbc54e198c2735c61b2..25a7a15693a2a03ef2c37adeb00a4ebaa715eeca 100644
@@ -31,7 +31,7 @@ int bar(int n){
 }
 
 // CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 0073db6f2884121494216ac8222c1ed3e9eccc19..1687c8ea761bf5a4ed5a96b66c15b9d84fca1cc7 100644
@@ -9,7 +9,7 @@
 #define HEADER
 
 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
 
 // Check that the execution mode of all 3 target regions is set to Spmd Mode.
 // CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
@@ -73,18 +73,16 @@ int bar(int n){
   // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
   // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
-  // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+  // CHECK: br i1 [[CMP]], label
 
-  // CHECK: [[REDLABEL]]
   // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
   // CHECK: [[EV:%.+]] = load double, double* [[E]], align
   // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
   // CHECK: store double [[ADD]], double* [[E_IN]], align
   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label %[[DEFAULTLABEL]]
+  // CHECK: br label
   //
-  // CHECK: [[DEFAULTLABEL]]
   // CHECK: ret
 
   //
@@ -187,18 +185,23 @@ int bar(int n){
   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+  // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+  // CHECK: br label
+  // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+  // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+  // CHECK: br i1 [[DONE_COPY]], label
   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
   //
   // [[DO_COPY]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+  // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
-  // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -215,13 +218,13 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+  // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
   // CHECK: [[READ_ELSE]]
@@ -229,6 +232,9 @@ int bar(int n){
   //
   // CHECK: [[READ_CONT]]
   // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+  // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+  // CHECK: br label
   // CHECK: ret
 
 
@@ -268,10 +274,8 @@ int bar(int n){
   // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
   // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
-  // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
-
-  // CHECK: [[REDLABEL]]
+  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+  // CHECK: br i1 [[CMP]], label
   // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
   // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
   // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
@@ -284,9 +288,8 @@ int bar(int n){
   // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
   // CHECK: store float [[MUL]], float* [[D_IN]], align
   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label %[[DEFAULTLABEL]]
+  // CHECK: br label
   //
-  // CHECK: [[DEFAULTLABEL]]
   // CHECK: ret
 
   //
@@ -432,10 +435,10 @@ int bar(int n){
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-  // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -452,11 +455,11 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
@@ -471,12 +474,11 @@ int bar(int n){
   // [[DO_COPY]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
-  // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -493,13 +495,12 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
   // CHECK: [[READ_ELSE]]
@@ -560,10 +561,9 @@ int bar(int n){
   // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
   // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
   // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
-  // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
-  // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+  // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+  // CHECK: br i1 [[CMP]], label
 
-  // CHECK: [[REDLABEL]]
   // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
   // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
   // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
@@ -587,9 +587,8 @@ int bar(int n){
   // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
   // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
   // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
-  // CHECK: br label %[[DEFAULTLABEL]]
+  // CHECK: br label
   //
-  // CHECK: [[DEFAULTLABEL]]
   // CHECK: ret
 
   //
@@ -752,10 +751,9 @@ int bar(int n){
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-  // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -772,12 +770,11 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
@@ -794,10 +791,10 @@ int bar(int n){
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-  // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -814,12 +811,12 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index 8b8e0b0bba779a1a69992b263ec18ea5fb543c4a..79315717ae60a7103b44df2edf5f65edd158fc77 100644
@@ -68,7 +68,7 @@ int bar(int n){
 }
 
 // CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index 395c10e6632d2db1cd1c5996018d2c7be586815a..2e9ceb129b47f5d8aeff903c6954525bb3c5bb9a 100644
@@ -63,7 +63,7 @@ int bar(int n){
 }
 
 // CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp
index 98988a5d4dd88806c163a6d15b62286dc267852c..4965a50781c43d3676f674982edb5b0ad73e4179 100644
@@ -28,7 +28,7 @@ int main (int argc, char **argv) {
 }
 
 // CK1: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
@@ -115,7 +115,7 @@ int main (int argc, char **argv) {
 }
 
 // CK2: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
 // CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
diff --git a/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index b73b5a8e93e53bebe771569274f66a5cc856d044..e2a103ab86de2ee6fc1c382ace71ce71b8e9f713 100644
@@ -8,13 +8,23 @@
 #ifndef HEADER
 #define HEADER
 
+// CHECK: [[MAP_TY:%.+]] = type { [16 x i8] }
+
+// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
+// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
+// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
+// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8
+
 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
 
-// Check that the execution mode of all 3 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1
+// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
 
 template<typename tx>
 tx ftemplate(int n) {
@@ -39,6 +49,7 @@ tx ftemplate(int n) {
 
   #pragma omp target
   #pragma omp teams reduction(|: a) reduction(max: b)
+  #pragma omp parallel reduction(|: a) reduction(max: b)
   {
     a |= 1;
     b = 99 > b ? 99 : b;
@@ -55,9 +66,9 @@ int bar(int n){
   return a;
 }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker()
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]](
   //
   // CHECK: {{call|invoke}} void [[T1]]_worker()
   //
@@ -186,18 +197,23 @@ int bar(int n){
   // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
   // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
   // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+  // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+  // CHECK: br label
+  // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+  // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+  // CHECK: br i1 [[DONE_COPY]], label
   // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
   // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
   //
   // [[DO_COPY]]
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+  // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
-  // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -214,13 +230,13 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+  // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
   // CHECK: [[READ_ELSE]]
@@ -228,6 +244,9 @@ int bar(int n){
   //
   // CHECK: [[READ_CONT]]
   // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+  // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+  // CHECK: br label
   // CHECK: ret
 
   //
@@ -307,9 +326,9 @@ int bar(int n){
   // CHECK: [[REDUCE_CONT]]
   // CHECK: ret
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]](
   //
   // CHECK: {{call|invoke}} void [[T2]]_worker()
   //
@@ -495,10 +514,10 @@ int bar(int n){
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
-  // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -515,11 +534,11 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
@@ -534,12 +553,11 @@ int bar(int n){
   // [[DO_COPY]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
-  // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
-  // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -556,13 +574,12 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
-  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
-  // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
   // CHECK: [[READ_ELSE]]
@@ -689,13 +706,60 @@ int bar(int n){
   // CHECK: [[REDUCE_CONT]]
   // CHECK: ret
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
-
-  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
+  //
+  // CHECK: call void @__kmpc_spmd_kernel_init(
+  // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
+  // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
+  // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
+  // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
+  // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
+  // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
+  // CHECK: store i32 0, i32* [[A_ADDR]],
+  // CHECK: store i16 -32768, i16* [[B_ADDR]],
+  // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
+  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
+  // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+  // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
+  // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+  // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+  // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
   //
-  // CHECK: {{call|invoke}} void [[T3]]_worker()
+  // CHECK: [[IFLABEL]]
+  // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+  // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
+  // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+  // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+  // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+  // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+  // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
+  // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
   //
-  // CHECK: call void @__kmpc_kernel_init(
+  // CHECK: [[DO_MAX]]
+  // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+  //
+  // CHECK: [[MAX_ELSE]]
+  // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
+  // CHECK: br label {{%?}}[[MAX_CONT]]
+  //
+  // CHECK: [[MAX_CONT]]
+  // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+  // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+  // CHECK: br label %[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
+  // CHECK: call void @__kmpc_spmd_kernel_deinit(
+
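For reference, a hedged reconstruction of the construct shape this new SPMD kernel block (the l50 target region) appears to check: an int reduced with | and a short reduced with max, globalized into the team's static shared-memory record, combined by __kmpc_nvptx_parallel_reduce_nowait inside the parallel region and then by __kmpc_nvptx_teams_reduce_nowait at team scope. The body and names below are illustrative; the actual test reaches this code through a template.

    int teamsReduction() {
      int A = 0;
      short B = 0;
    #pragma omp target teams map(tofrom: A, B) reduction(|: A) reduction(max: B)
    #pragma omp parallel reduction(|: A) reduction(max: B)
      {
        A |= 1;                  // private copies start at 0 (| identity)
        B = B < 1 ? 1 : B;       // and at -32768 (i16 max identity)
      }
      return A + B;
    }

The store i32 0 and store i16 -32768 checked in the kernel are exactly those reduction-identity initializations of the team-private copies.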
+  // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
   //
   // CHECK: store i32 0, i32* [[A:%.+]], align
   // CHECK: store i16 -32768, i16* [[B:%.+]], align
@@ -726,7 +790,7 @@ int bar(int n){
   // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
   // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
   // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
-  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
   // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
   // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
   //
@@ -757,7 +821,243 @@ int bar(int n){
   // CHECK: br label %[[EXIT]]
   //
   // CHECK: [[EXIT]]
-  // CHECK: call void @__kmpc_kernel_deinit(
+  // CHECK: ret void
+
+  //
+  // Reduction function
+  // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
+  // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+  // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+  //
+  // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+  // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+  //
+  // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+  // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+  //
+  // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+  // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+  //
+  // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+  // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+  // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+  // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+  //
+  // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+  // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+  // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+  // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+  //
+  // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+  // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+  //
+  // CHECK: [[DO_MAX]]
+  // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+  // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+  //
+  // CHECK: [[MAX_ELSE]]
+  // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+  // CHECK: br label {{%?}}[[MAX_CONT]]
+  //
+  // CHECK: [[MAX_CONT]]
+  // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+  // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+  // CHECK: ret void
+
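The reduction function traced above is the standard pairwise fold over a two-slot reduction list: bitwise or for the i32 element, and a sign-extended signed compare for the i16 max, mirroring the sext/icmp sgt/phi sequence in the checks. A minimal C++ equivalent (parameter and variable names assumed):

    void reductionFunc(void **LhsList, void **RhsList) {
      int *Var1Lhs = static_cast<int *>(LhsList[0]);
      int *Var1Rhs = static_cast<int *>(RhsList[0]);
      *Var1Lhs |= *Var1Rhs;                        // or i32

      short *Var2Lhs = static_cast<short *>(LhsList[1]);
      short *Var2Rhs = static_cast<short *>(RhsList[1]);
      // sext both sides to i32, icmp sgt, keep the larger original i16
      *Var2Lhs = (int)*Var2Lhs > (int)*Var2Rhs ? *Var2Lhs : *Var2Rhs;
    }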
+  //
+  // Shuffle and reduce function
+  // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+  // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+  // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+  // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+  //
+  // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+  // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+  // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+  //
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+  //
+  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+  // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+  //
+  // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+  // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+  // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+  //
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+  // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+  //
+  // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+  // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+  // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+  // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+  //
+  // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+  // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+  // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+  //
+  // Condition to reduce
+  // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+  //
+  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+  // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+  // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+  //
+  // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+  // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+  // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+  // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+  // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+  // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+  //
+  // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+  // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+  // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+  //
+  // CHECK: [[DO_REDUCE]]
+  // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+  // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+  // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+  // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+  //
+  // CHECK: [[REDUCE_ELSE]]
+  // CHECK: br label {{%?}}[[REDUCE_CONT]]
+  //
+  // CHECK: [[REDUCE_CONT]]
+  // Now check if we should just copy over the remote reduction list
+  // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+  // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+  // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+  // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+  //
+  // CHECK: [[DO_COPY]]
+  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+  // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+  //
+  // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+  // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+  // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+  //
+  // CHECK: [[COPY_ELSE]]
+  // CHECK: br label {{%?}}[[COPY_CONT]]
+  //
+  // CHECK: [[COPY_CONT]]
+  // CHECK: ret void
+
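The shuffle-and-reduce function is the tree step of the warp-level reduction: each lane fetches its partner's partials with __kmpc_shuffle_int32 (the i16 element travels sign-extended through i32), reduces them when the (AlgVer, LaneId, Offset) predicate allows, and under algorithm 1 the lanes at or past the offset adopt the remote values instead. A hedged C++ rendering; the shuffle stub and all names are assumptions, and the pairwise combine is inlined here in place of the call to the reduction function:

    int shuffleInt32Stub(int Val, short Delta, short Width) { return Val; }

    void shuffleAndReduce(void **RedList, unsigned short LaneId,
                          unsigned short Offset, unsigned short AlgVer) {
      int Remote1 = shuffleInt32Stub(*(int *)RedList[0], Offset, /*warp*/ 32);
      short Remote2 =
          (short)shuffleInt32Stub((int)*(short *)RedList[1], Offset, 32);

      // 0: full warp; 1: contiguous partial warp; 2: dispersed partial warp.
      bool ShouldReduce =
          AlgVer == 0 || (AlgVer == 1 && LaneId < Offset) ||
          (AlgVer == 2 && (LaneId & 1) == 0 && (short)Offset > 0);
      if (ShouldReduce) {                          // inline pairwise | and max
        *(int *)RedList[0] |= Remote1;
        short Cur = *(short *)RedList[1];
        *(short *)RedList[1] = (int)Cur > (int)Remote2 ? Cur : Remote2;
      }
      if (AlgVer == 1 && LaneId >= Offset) {       // copy the remote list over
        *(int *)RedList[0] = Remote1;
        *(short *)RedList[1] = Remote2;
      }
    }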
+  //
+  // Inter warp copy function
+  // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
+  // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+  // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+  // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+  //
+  // [[DO_COPY]]
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  //
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+  //
+  // CHECK: [[COPY_ELSE]]
+  // CHECK: br label {{%?}}[[COPY_CONT]]
+  //
+  // Barrier after copy to shared memory storage medium.
+  // CHECK: [[COPY_CONT]]
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+  // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  //
+  // Read into warp 0.
+  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+  //
+  // CHECK: [[DO_READ]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+  // CHECK: br label {{%?}}[[READ_CONT:.+]]
+  //
+  // CHECK: [[READ_ELSE]]
+  // CHECK: br label {{%?}}[[READ_CONT]]
+  //
+  // CHECK: [[READ_CONT]]
+  // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+  // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+  //
+  // [[DO_COPY]]
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+  //
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+  // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+  //
+  // CHECK: [[COPY_ELSE]]
+  // CHECK: br label {{%?}}[[COPY_CONT]]
+  //
+  // Barrier after copy to shared memory storage medium.
+  // CHECK: [[COPY_CONT]]
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+  // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  //
+  // Read into warp 0.
+  // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+  // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+  //
+  // CHECK: [[DO_READ]]
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+  // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+  // CHECK: br label {{%?}}[[READ_CONT:.+]]
+  //
+  // CHECK: [[READ_ELSE]]
+  // CHECK: br label {{%?}}[[READ_CONT]]
+  //
+  // CHECK: [[READ_CONT]]
+  // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+  // CHECK: ret
 
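The inter-warp copy function checked above moves the two list elements through the shared [32 x i32] medium one round at a time: the i32 element occupies a slot directly, the sub-word i16 element is written through a bitcast view of its slot, and each round is bracketed by barriers over the active threads. A condensed C++ sketch under the same stand-in names as earlier:

    volatile int WarpTransfer[32];               // shared [32 x i32] stand-in
    void barrierActive() {}                      // stub for llvm.nvvm.barrier

    void parWarpCopy(void **RedList, int LaneId, int WarpId, int Tid,
                     int NumActiveWarps) {
      if (LaneId == 0)                           // round 1: the i32 element
        WarpTransfer[WarpId] = *(int *)RedList[0];
      barrierActive();
      if (Tid < NumActiveWarps)
        *(int *)RedList[0] = WarpTransfer[Tid];
      barrierActive();

      if (LaneId == 0)                           // round 2: i16 via bitcast slot
        *(volatile short *)&WarpTransfer[WarpId] = *(short *)RedList[1];
      barrierActive();
      if (Tid < NumActiveWarps)
        *(short *)RedList[1] = *(volatile short *)&WarpTransfer[Tid];
      barrierActive();
    }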
   //
   // Reduction function
@@ -919,10 +1219,9 @@ int bar(int n){
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
   // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
-  // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -939,12 +1238,11 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //
@@ -961,10 +1259,10 @@ int bar(int n){
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
   //
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
-  // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: br label {{%?}}[[COPY_CONT:.+]]
   //
   // CHECK: [[COPY_ELSE]]
@@ -981,12 +1279,12 @@ int bar(int n){
   // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
   //
   // CHECK: [[DO_READ]]
-  // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
-  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+  // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+  // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
   // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
   // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
   // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
-  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+  // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
   // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
   // CHECK: br label {{%?}}[[READ_CONT:.+]]
   //