NB_Parallel = 1,
};
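+/// Return the canonical declaration referenced by a private clause item,
+/// looking through parentheses, implicit casts, array subscripts and array
+/// sections.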
+static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
+ RefExpr = RefExpr->IgnoreParens();
+ if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
+ const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+ Base = TempASE->getBase()->IgnoreParenImpCasts();
+ RefExpr = Base;
+ } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
+ const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+ Base = TempOASE->getBase()->IgnoreParenImpCasts();
+ while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+ Base = TempASE->getBase()->IgnoreParenImpCasts();
+ RefExpr = Base;
+ }
+ RefExpr = RefExpr->IgnoreParenImpCasts();
+ if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
+ return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+ const auto *ME = cast<MemberExpr>(RefExpr);
+ return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
return P1.first > P2.first;
}
public:
- CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
+ CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
+ ArrayRef<const ValueDecl *> TeamsReductions)
+ : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
+ }
virtual ~CheckVarsEscapingDeclContext() = default;
void VisitDeclStmt(const DeclStmt *S) {
if (!S)
/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
- CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
+ llvm::Function *F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
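+  // Mark the barrier as convergent so optimizations do not sink it into
+  // divergent control flow.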
+ F->addFnAttr(llvm::Attribute::Convergent);
+ CGF.EmitRuntimeCall(F);
}
/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
llvm::Value *NumThreads) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
- CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
- &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
- Args);
+ llvm::Function *F = llvm::Intrinsic::getDeclaration(
+ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier);
+ F->addFnAttr(llvm::Attribute::Convergent);
+ CGF.EmitRuntimeCall(F, Args);
}
/// Synchronize all GPU threads in a block.
if (!Dir)
return;
for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
- for (const Expr *E : C->getVarRefs()) {
- const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
- Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
- }
+ for (const Expr *E : C->getVarRefs())
+ Vars.push_back(getPrivateItem(E));
+ }
+}
+
+/// Get list of reduction variables from the teams ... directives.
+static void
+getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
+ llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
+ assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
+ "expected teams directive.");
+ for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
+ for (const Expr *E : C->privates())
+ Vars.push_back(getPrivateItem(E));
}
}
SourceLocation Loc = D.getBeginLoc();
const RecordDecl *GlobalizedRD = nullptr;
- llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
+ llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
+  // Globalize teams reduction variables unconditionally in all modes.
+ getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
- getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates);
- if (!LastPrivates.empty())
+ getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
+ if (!LastPrivatesReductions.empty()) {
GlobalizedRD = ::buildRecordForGlobalizedVars(
- CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
+ CGM.getContext(), llvm::None, LastPrivatesReductions,
+ MappedDeclsFields);
+ }
+ } else if (!LastPrivatesReductions.empty()) {
+ assert(!TeamAndReductions.first &&
+ "Previous team declaration is not expected.");
+ TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
+ std::swap(TeamAndReductions.second, LastPrivatesReductions);
}
// Emit target region as a standalone region.
/*Volatile=*/false, Int16Ty, Loc);
auto *StaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
- llvm::GlobalValue::WeakAnyLinkage, nullptr);
+ llvm::GlobalValue::CommonLinkage, nullptr);
auto *RecSize = new llvm::GlobalVariable(
CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
IntType, Offset, Loc);
CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
- Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
- ElemPtr =
+ Address LocalPtr =
+ Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
+ Address LocalElemPtr =
Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
- PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
- PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
+ PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
+ PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
CGF.EmitBranch(PreCondBB);
CGF.EmitBlock(ExitBB);
} else {
CGF.SizeTy, /*isSigned=*/true);
Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
- llvm::Value *WidthVal =
- Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
- Int32Ty, SourceLocation()),
- CGF.SizeTy, /*isSigned=*/true);
+ llvm::Value *WidthVal = Bld.CreateIntCast(
+ CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
+ CGF.SizeTy, /*isSigned=*/true);
// The absolute ptr address to the base addr of the next element to copy.
llvm::Value *CumulativeElemBasePtr =
llvm::GlobalVariable *TransferMedium =
M.getGlobalVariable(TransferMediumName);
if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
+ auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
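+    // Wider reduction elements are copied through this 32-bit medium in
+    // multiple 32-bit chunks (see the chunked copy loop below).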
unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
TransferMedium = new llvm::GlobalVariable(
- M, Ty,
- /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
+ M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
llvm::Constant::getNullValue(Ty), TransferMediumName,
/*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
SharedAddressSpace);
Address LocalReduceList(
Bld.CreatePointerBitCastOrAddrSpaceCast(
CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
- C.VoidPtrTy, SourceLocation()),
+ C.VoidPtrTy, Loc),
CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
CGF.getPointerAlign());
// Warp master copies reduce element to transfer medium in __shared__
// memory.
//
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr =
- Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = (type[i]*)(elemptrptr)
- Address ElemPtr =
- Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
- ElemPtr = Bld.CreateElementBitCast(
- ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
- Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
- // Casting to actual data type.
- // MediumPtr = (type[i]*)MediumPtrAddr;
- MediumPtr = Bld.CreateElementBitCast(
- MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // elem = *elemptr
- //*MediumPtr = elem
- if (Private->getType()->isScalarType()) {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
- Private->getType(), Loc);
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
- Private->getType());
- } else {
- CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
- CGF.MakeAddrLValue(MediumPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- }
-
- Bld.CreateBr(MergeBB);
-
- CGF.EmitBlock(ElseBB);
- Bld.CreateBr(MergeBB);
+ unsigned RealTySize =
+ C.getTypeSizeInChars(Private->getType())
+ .alignTo(C.getTypeAlignInChars(Private->getType()))
+ .getQuantity();
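+      // Copy the element through the transfer medium in chunks of at most 4
+      // bytes, falling back to 2- and 1-byte chunks for any remainder.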
+      for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+ unsigned NumIters = RealTySize / TySize;
+ if (NumIters == 0)
+ continue;
+ QualType CType = C.getIntTypeForBitwidth(
+ C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
+ llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
+ CharUnits Align = CharUnits::fromQuantity(TySize);
+ llvm::Value *Cnt = nullptr;
+ Address CntAddr = Address::invalid();
+ llvm::BasicBlock *PrecondBB = nullptr;
+ llvm::BasicBlock *ExitBB = nullptr;
+ if (NumIters > 1) {
+ CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
+ CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
+ /*Volatile=*/false, C.IntTy);
+ PrecondBB = CGF.createBasicBlock("precond");
+ ExitBB = CGF.createBasicBlock("exit");
+ llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(PrecondBB);
+ Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
+ llvm::Value *Cmp =
+ Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
+ Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
+ CGF.EmitBlock(BodyBB);
+ }
+ llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
+ llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
+ llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
- CGF.EmitBlock(MergeBB);
+ // if (lane_id == 0)
+ llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
+ Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+ CGF.EmitBlock(ThenBB);
- Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
- llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
- AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
+ // Reduce element = LocalReduceList[i]
+ Address ElemPtrPtrAddr =
+ Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+ llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
+ ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+ // elemptr = ((CopyType*)(elemptrptr)) + I
+ Address ElemPtr = Address(ElemPtrPtr, Align);
+ ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
+ if (NumIters > 1) {
+ ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
+ ElemPtr.getAlignment());
+ }
- llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
- NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
- // named_barrier_sync(ParallelBarrierID, num_active_threads)
- syncParallelThreads(CGF, NumActiveThreads);
+ // Get pointer to location in transfer medium.
+ // MediumPtr = &medium[warp_id]
+ llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
+ TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
+ Address MediumPtr(MediumPtrVal, Align);
+ // Casting to actual data type.
+ // MediumPtr = (CopyType*)MediumPtrAddr;
+ MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
+
+ // elem = *elemptr
+ //*MediumPtr = elem
+ llvm::Value *Elem =
+ CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
+ // Store the source element value to the dest element address.
+ CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
+
+ Bld.CreateBr(MergeBB);
+
+ CGF.EmitBlock(ElseBB);
+ Bld.CreateBr(MergeBB);
+
+ CGF.EmitBlock(MergeBB);
+
+ Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
+ llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
+ AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
+
+ llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
+ NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
+ // named_barrier_sync(ParallelBarrierID, num_active_threads)
+ syncParallelThreads(CGF, NumActiveThreads);
+
+ //
+ // Warp 0 copies reduce element from transfer medium.
+ //
+ llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
+ llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
+ llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
+
+ // Up to 32 threads in warp 0 are active.
+ llvm::Value *IsActiveThread =
+ Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+ Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+ CGF.EmitBlock(W0ThenBB);
+
+ // SrcMediumPtr = &medium[tid]
+ llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
+ TransferMedium,
+ {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
+ Address SrcMediumPtr(SrcMediumPtrVal, Align);
+ // SrcMediumVal = *SrcMediumPtr;
+ SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
+
+ // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+ Address TargetElemPtrPtr =
+ Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
+ llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
+ TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
+ Address TargetElemPtr = Address(TargetElemPtrVal, Align);
+ TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
+ if (NumIters > 1) {
+ TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
+ TargetElemPtr.getAlignment());
+ }
- //
- // Warp 0 copies reduce element from transfer medium.
- //
- llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
- // Up to 32 threads in warp 0 are active.
- llvm::Value *IsActiveThread =
- Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
- Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
- CGF.EmitBlock(W0ThenBB);
-
- // SrcMediumPtr = &medium[tid]
- llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
- Address SrcMediumPtr(SrcMediumPtrVal,
- C.getTypeAlignInChars(Private->getType()));
- // SrcMediumVal = *SrcMediumPtr;
- SrcMediumPtr = Bld.CreateElementBitCast(
- SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
- Address TargetElemPtrPtr =
- Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
- llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
- TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- Address TargetElemPtr =
- Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
- TargetElemPtr = Bld.CreateElementBitCast(
- TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
-
- // *TargetElemPtr = SrcMediumVal;
- if (Private->getType()->isScalarType()) {
- llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
- SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
+ // *TargetElemPtr = SrcMediumVal;
+ llvm::Value *SrcMediumValue =
+ CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
- Private->getType());
- } else {
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
- CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- }
- Bld.CreateBr(W0MergeBB);
+ CType);
+ Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0ElseBB);
- Bld.CreateBr(W0MergeBB);
+ CGF.EmitBlock(W0ElseBB);
+ Bld.CreateBr(W0MergeBB);
- CGF.EmitBlock(W0MergeBB);
+ CGF.EmitBlock(W0MergeBB);
- // While warp 0 copies values from transfer medium, all other warps must
- // wait.
- syncParallelThreads(CGF, NumActiveThreads);
+ // While warp 0 copies values from transfer medium, all other warps must
+ // wait.
+ syncParallelThreads(CGF, NumActiveThreads);
+ if (NumIters > 1) {
+ Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
+ CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
+ CGF.EmitBranch(PrecondBB);
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ExitBB);
+ }
+ RealTySize %= TySize;
+ }
++Idx;
}
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
- bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
- assert((TeamsReduction || ParallelReduction || SimdReduction) &&
- "Invalid reduction selection in emitReduction.");
if (Options.SimpleReduction) {
+ assert(!TeamsReduction && !ParallelReduction &&
+ "Invalid reduction selection in emitReduction.");
CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
ReductionOps, Options);
return;
}
+ assert((TeamsReduction || ParallelReduction) &&
+ "Invalid reduction selection in emitReduction.");
ASTContext &C = CGM.getContext();
// 1. Build a list of reduction variables.
llvm::Value *InterWarpCopyFn =
emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
- llvm::Value *Args[] = {ThreadId,
- CGF.Builder.getInt32(RHSExprs.size()),
- ReductionArrayTySize,
- RL,
- ShuffleAndReduceFn,
- InterWarpCopyFn};
+ llvm::Value *Res;
+ if (ParallelReduction) {
+ llvm::Value *Args[] = {ThreadId,
+ CGF.Builder.getInt32(RHSExprs.size()),
+ ReductionArrayTySize,
+ RL,
+ ShuffleAndReduceFn,
+ InterWarpCopyFn};
- llvm::Value *Res = nullptr;
- if (ParallelReduction)
Res = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
Args);
- else if (SimdReduction)
- Res = CGF.EmitRuntimeCall(
- createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
- Args);
-
- if (TeamsReduction) {
+ } else {
+ assert(TeamsReduction && "expected teams reduction.");
llvm::Value *ScratchPadCopyFn =
emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
Args);
}
- // 5. Build switch(res)
- llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
- llvm::SwitchInst *SwInst =
- CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
+ // 5. Build if (res == 1)
+ llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
+ llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
+ llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
+ Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
+ CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
- // 6. Build case 1: where we have reduced values in the master
+  // 6. Build the then branch, where we have reduced values in the master
// thread in each team.
// __kmpc_end_reduce{_nowait}(<gtid>);
// break;
- llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
- SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
- CGF.EmitBlock(Case1BB);
+ CGF.EmitBlock(ThenBB);
// Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
llvm::Value *EndArgs[] = {ThreadId};
EndArgs);
RCG.setAction(Action);
RCG(CGF);
- CGF.EmitBranch(DefaultBB);
- CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ // There is no need to emit line number for unconditional branch.
+ (void)ApplyDebugLocation::CreateEmpty(CGF);
+ CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
const VarDecl *
assert(D && "Expected function or captured|block decl.");
assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
"Function is registered already.");
+ assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
+ "Team is set but not processed.");
const Stmt *Body = nullptr;
bool NeedToDelayGlobalization = false;
if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
}
if (!Body)
return;
- CheckVarsEscapingDeclContext VarChecker(CGF);
+ CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
VarChecker.Visit(Body);
const RecordDecl *GlobalizedVarsRecord =
VarChecker.getGlobalizedRecord(IsInTTDRegion);
+ TeamAndReductions.first = nullptr;
+ TeamAndReductions.second.clear();
ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
VarChecker.getEscapedVariableLengthDecls();
if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
}
if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
- CheckVarsEscapingDeclContext VarChecker(CGF);
+ CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
VarChecker.Visit(Body);
I->getSecond().SecondaryGlobalRecord =
VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMStaticTy,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
llvm::Constant::getNullValue(LLVMStaticTy),
"_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMArr2Ty,
- /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
+ /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
llvm::Constant::getNullValue(LLVMArr2Ty),
"_openmp_static_glob_rd_$_");
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
/// Shared pointer for the global memory in the global memory buffer used for
/// the given kernel.
llvm::GlobalVariable *KernelStaticGlobalized = nullptr;
+  /// Pair of the non-SPMD teams region and all reduction variables in this
+  /// team region.
+ std::pair<const Decl *, llvm::SmallVector<const ValueDecl *, 4>>
+ TeamAndReductions;
};
} // CodeGen namespace.
}
}
// CK1: [[MEM_TY:%.+]] = type { [8 x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
}
// CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
}
// CHECK: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
#define HEADER
// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
// Check that the execution mode of all 3 target regions is set to Spmd Mode.
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
- // CHECK: [[REDLABEL]]
// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
// CHECK: [[EV:%.+]] = load double, double* [[E]], align
// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
// CHECK: store double [[ADD]], double* [[E_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+ // CHECK: br label
+ // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+ // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+ // CHECK: br i1 [[DONE_COPY]], label
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
- // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+ // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+ // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+ // CHECK: br label
// CHECK: ret
// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
-
- // CHECK: [[REDLABEL]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align
// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32
// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align
// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]]
// CHECK: store float [[MUL]], float* [[D_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
- // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
- // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
- // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+ // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[CMP]], label
- // CHECK: [[REDLABEL]]
// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align
// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
- // CHECK: br label %[[DEFAULTLABEL]]
+ // CHECK: br label
//
- // CHECK: [[DEFAULTLABEL]]
// CHECK: ret
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
}
// CHECK-DAG: [[MEM_TY:%.+]] = type { [4 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
}
// CK1: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
}
// CK2: [[MEM_TY:%.+]] = type { [{{4|8}} x i8] }
-// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
#ifndef HEADER
#define HEADER
+// CHECK: [[MAP_TY:%.+]] = type { [16 x i8] }
+
+// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
+// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1
+// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
+// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
+// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8
+
// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
-// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
-// Check that the execution mode of all 3 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1
+// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is set to SPMD.
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
template<typename tx>
tx ftemplate(int n) {
#pragma omp target
#pragma omp teams reduction(|: a) reduction(max: b)
+ #pragma omp parallel reduction(|: a) reduction(max: b)
{
a |= 1;
b = 99 > b ? 99 : b;
return a;
}
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker()
- // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]](
+ // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]](
//
// CHECK: {{call|invoke}} void [[T1]]_worker()
//
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]],
+ // CHECK: br label
+ // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
+ // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
+ // CHECK: br i1 [[DONE_COPY]], label
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
// [[DO_COPY]]
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
+ // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]]
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align
- // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]],
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align
+ // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]]
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]],
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]],
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
//
// CHECK: [[READ_CONT]]
// CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1
+ // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]],
+ // CHECK: br label
// CHECK: ret
//
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker()
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
- // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]](
+ // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]](
//
// CHECK: {{call|invoke}} void [[T2]]_worker()
//
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
- // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// [[DO_COPY]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
- // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align
- // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
- // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
- // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[READ_ELSE]]
// CHECK: [[REDUCE_CONT]]
// CHECK: ret
- // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker()
-
- // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]](
+ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}(
+ //
+ // CHECK: call void @__kmpc_spmd_kernel_init(
+ // CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
+ // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**))
+ // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
+ // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]*
+ // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0
+ // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1
+ // CHECK: store i32 0, i32* [[A_ADDR]],
+ // CHECK: store i16 -32768, i16* [[B_ADDR]],
+ // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]])
+ // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A_ADDR]] to i8*
+ // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align
+ // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B_ADDR]] to i8*
+ // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
+ // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
+ // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
- // CHECK: {{call|invoke}} void [[T3]]_worker()
+ // CHECK: [[IFLABEL]]
+ // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align
+ // CHECK: [[AV:%.+]] = load i32, i32* [[A_ADDR]], align
+ // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]]
+ // CHECK: store i32 [[OR]], i32* [[A_IN]], align
+ // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align
+ // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32
+ // CHECK: [[BV16:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
//
- // CHECK: call void @__kmpc_kernel_init(
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[B_ADDR]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align
+ // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+ // CHECK: br label %[[EXIT]]
+ //
+ // CHECK: [[EXIT]]
+ // call void @__kmpc_restore_team_static_memory(i16 1)
+ // CHECK: call void @__kmpc_spmd_kernel_deinit(
+
+ // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}})
//
// CHECK: store i32 0, i32* [[A:%.+]], align
// CHECK: store i16 -32768, i16* [[B:%.+]], align
// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8*
// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align
// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
- // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]])
+ // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]])
// CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1
// CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]]
//
// CHECK: br label %[[EXIT]]
//
// CHECK: [[EXIT]]
- // CHECK: call void @__kmpc_kernel_deinit(
+ // CHECK: ret void
+
+ //
+ // Reduction function
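+ // (Per the checks below, the reduction function reinterprets its two i8*
+ // arguments as reduction lists and combines the element pairs in place:
+ // bitwise or for the first (i32) element and signed max for the second (i16)
+ // element, with the result stored back into the LHS list.)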
+ // CHECK: define internal void [[PAR_REDUCTION_FUNC:@.+]](i8*, i8*)
+ // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+ // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+ // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32*
+ //
+ // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]],
+ // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]],
+ // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16*
+ //
+ // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]],
+ // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]],
+ // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]]
+ // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]],
+ //
+ // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]],
+ // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32
+ // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]],
+ // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32
+ //
+ // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]]
+ // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]]
+ //
+ // CHECK: [[DO_MAX]]
+ // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT:.+]]
+ //
+ // CHECK: [[MAX_ELSE]]
+ // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align
+ // CHECK: br label {{%?}}[[MAX_CONT]]
+ //
+ // CHECK: [[MAX_CONT]]
+ // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ]
+ // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]],
+ // CHECK: ret void
+
+ //
+ // Shuffle and reduce function
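+ // (Per the checks below, each reduction list element is exchanged with a
+ // remote lane through __kmpc_shuffle_int32; the i16 element is sign-extended
+ // to i32 for the shuffle and truncated back, and both shuffled values are
+ // stored into a local remote reduction list.)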
+ // CHECK: define internal void [[PAR_SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}})
+ // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align
+ // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32
+ // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16
+ //
+ // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align
+ // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ //
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
+ //
+ // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align
+ // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ //
+ // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
+ // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
+ // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
+ // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
+ //
+ // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align
+ // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8*
+ // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align
+ //
+ // Condition to reduce
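+ // (The predicates below encode the three reduction algorithm versions:
+ // version 0 always reduces, version 1 reduces only lanes with
+ // LaneId < LaneOffset, and version 2 reduces only even lanes while
+ // LaneOffset > 0.)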
+ // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0
+ //
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]]
+ //
+ // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2
+ // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1
+ // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0
+ // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]]
+ // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0
+ // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]]
+ //
+ // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]]
+ // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]]
+ // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]]
+ //
+ // CHECK: [[DO_REDUCE]]
+ // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8*
+ // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8*
+ // CHECK: call void [[PAR_REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]])
+ // CHECK: br label {{%?}}[[REDUCE_CONT:.+]]
+ //
+ // CHECK: [[REDUCE_ELSE]]
+ // CHECK: br label {{%?}}[[REDUCE_CONT]]
+ //
+ // CHECK: [[REDUCE_CONT]]
+ // Now check if we should just copy over the remote reduction list
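+ // (For algorithm version 1, lanes with LaneId >= LaneOffset take the
+ // shuffled remote values instead of reducing, copying them element by
+ // element over their own reduction list, as checked below.)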
+ // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1
+ // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]]
+ // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]]
+ // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // CHECK: [[DO_COPY]]
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align
+ // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align
+ //
+ // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]],
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16*
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align
+ // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // CHECK: [[COPY_CONT]]
+ // CHECK: ret void
+
+ //
+ // Inter warp copy function
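+ // (Per the checks below, each warp master writes its element into a 32-slot
+ // transfer medium in shared memory, all active threads synchronize on
+ // llvm.nvvm.barrier, and the active lanes of warp 0 read the values back into
+ // their reduction list; the sequence runs once for the i32 element and once
+ // for the i16 element.)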
+ // CHECK: define internal void [[PAR_WARP_COPY_FN]](i8*, i32)
+ // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
+ // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
+ // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // CHECK: [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ //
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
+ // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
+ //
+ // CHECK: [[DO_COPY]]
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ //
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: br label {{%?}}[[COPY_CONT:.+]]
+ //
+ // CHECK: [[COPY_ELSE]]
+ // CHECK: br label {{%?}}[[COPY_CONT]]
+ //
+ // Barrier after copy to shared memory storage medium.
+ // CHECK: [[COPY_CONT]]
+ // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+ // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ //
+ // Read into warp 0.
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]]
+ // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
+ //
+ // CHECK: [[DO_READ]]
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
+ // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
+ // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
+ // CHECK: br label {{%?}}[[READ_CONT:.+]]
+ //
+ // CHECK: [[READ_ELSE]]
+ // CHECK: br label {{%?}}[[READ_CONT]]
+ //
+ // CHECK: [[READ_CONT]]
+ // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]])
+ // CHECK: ret
//
// Reduction function
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
- // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
//
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
- // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: br label {{%?}}[[COPY_CONT:.+]]
//
// CHECK: [[COPY_ELSE]]
// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]]
//
// CHECK: [[DO_READ]]
- // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
- // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])*
+ // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]]
+ // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])*
// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1
// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]],
// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16*
- // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
+ // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align
// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align
// CHECK: br label {{%?}}[[READ_CONT:.+]]
//