From: Alexey Bataev
Date: Wed, 20 Feb 2019 16:36:22 +0000 (+0000)
Subject: [OPENMP][NVPTX]Use faster teams reduction algorithm.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=349d010d880cff6877e83050c66dda47822ef7c4;p=clang

[OPENMP][NVPTX]Use faster teams reduction algorithm.

A faster way to reduce the values in teams reductions was found; the codegen is
updated to use this faster algorithm and the new runtime functions. Instead of
funneling the teams reduction through __kmpc_nvptx_teams_reduce_nowait_simple
and a critical-section lock, the generated code now calls
__kmpc_nvptx_teams_reduce_nowait_v2 with a global intermediate buffer of
reduction records (1024 by default, tunable with
-fopenmp-cuda-teams-reduction-recs-num=) and helper functions that copy and
reduce data between each team's local reduce list and that buffer.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@354479 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/LangOptions.def b/include/clang/Basic/LangOptions.def
index 905089e553..9df5550f80 100644
--- a/include/clang/Basic/LangOptions.def
+++ b/include/clang/Basic/LangOptions.def
@@ -207,6 +207,7 @@ LANGOPT(OpenMPCUDAMode , 1, 0, "Generate code for OpenMP pragmas in SIMT/SPMD
 LANGOPT(OpenMPCUDAForceFullRuntime , 1, 0, "Force to use full runtime in all constructs when offloading to CUDA devices")
 LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.")
 LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.")
+LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
 LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
 LANGOPT(RenderScript , 1, 0, "RenderScript")

diff --git a/include/clang/Driver/Options.td b/include/clang/Driver/Options.td
index e3e9f90c6f..3b947d5496 100644
--- a/include/clang/Driver/Options.td
+++ b/include/clang/Driver/Options.td
@@ -1573,6 +1573,8 @@ def fopenmp_cuda_number_of_sm_EQ : Joined<["-"], "fopenmp-cuda-number-of-sm=">,
   Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
 def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group,
   Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
+def fopenmp_cuda_teams_reduction_recs_num_EQ : Joined<["-"], "fopenmp-cuda-teams-reduction-recs-num=">, Group,
+  Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
 def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Group,
   Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
 def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group,
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index f66943a701..17ee48fe01 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -60,13 +60,19 @@ enum OpenMPRTLFunctionNVPTX {
   /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
   /// lane_offset, int16_t shortCircuit),
   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
-  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2,
-  /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32
-  /// global_tid, kmp_critical_name *lck)
-  OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple,
-  /// Call to __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc,
-  /// kmp_int32 global_tid, kmp_critical_name *lck)
-  OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple,
+  OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
+  /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
+  /// global_tid, void *global_buffer, int32_t num_of_records, void*
+  /// reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t
warp_num), void + /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data), + /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx, + /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer, + /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void + /// *buffer, int idx, void *reduce_data)); + OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2, /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); OMPRTL_NVPTX__kmpc_end_reduce_nowait, /// Call to void __kmpc_data_sharing_init_stack(); @@ -224,7 +230,7 @@ static RecordDecl *buildRecordForGlobalizedVars( ASTContext &C, ArrayRef EscapedDecls, ArrayRef EscapedDeclsForTeams, llvm::SmallDenseMap - &MappedDeclsFields) { + &MappedDeclsFields, int BufSize) { if (EscapedDecls.empty() && EscapedDeclsForTeams.empty()) return nullptr; SmallVector GlobalizedVars; @@ -270,7 +276,7 @@ static RecordDecl *buildRecordForGlobalizedVars( Field->addAttr(*I); } } else { - llvm::APInt ArraySize(32, WarpSize); + llvm::APInt ArraySize(32, BufSize); Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0); Field = FieldDecl::Create( C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type, @@ -419,7 +425,7 @@ class CheckVarsEscapingDeclContext final EscapedDeclsForParallel = EscapedDecls.getArrayRef(); GlobalizedRD = ::buildRecordForGlobalizedVars( CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, - MappedDeclsFields); + MappedDeclsFields, WarpSize); } public: @@ -1651,7 +1657,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); break; } - case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: { + case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: { // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t @@ -1688,28 +1694,47 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); break; } - case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple: { - // Build __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 - // global_tid, kmp_critical_name *lck) - llvm::Type *TypeParams[] = { - getIdentTyPointerTy(), CGM.Int32Ty, - llvm::PointerType::getUnqual(getKmpCriticalNameTy())}; + case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: { + // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32 + // global_tid, void *global_buffer, int32_t num_of_records, void* + // reduce_data, + // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t + // lane_offset, int16_t shortCircuit), + // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void + // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data), + // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx, + // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer, + // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void + // *buffer, int idx, void *reduce_data)); + llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, + CGM.Int16Ty, CGM.Int16Ty}; + auto *ShuffleReduceFnTy = + llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, + /*isVarArg=*/false); + llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; + auto *InterWarpCopyFnTy = + llvm::FunctionType::get(CGM.VoidTy, 
InterWarpCopyTypeParams, + /*isVarArg=*/false); + llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy, + CGM.VoidPtrTy}; + auto *GlobalListFnTy = + llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams, + /*isVarArg=*/false); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), + CGM.Int32Ty, + CGM.VoidPtrTy, + CGM.Int32Ty, + CGM.VoidPtrTy, + ShuffleReduceFnTy->getPointerTo(), + InterWarpCopyFnTy->getPointerTo(), + GlobalListFnTy->getPointerTo(), + GlobalListFnTy->getPointerTo(), + GlobalListFnTy->getPointerTo(), + GlobalListFnTy->getPointerTo()}; auto *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_simple"); - break; - } - case OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple: { - // Build __kmpc_nvptx_teams_end_reduce_nowait_simple(ident_t *loc, kmp_int32 - // global_tid, kmp_critical_name *lck) - llvm::Type *TypeParams[] = { - getIdentTyPointerTy(), CGM.Int32Ty, - llvm::PointerType::getUnqual(getKmpCriticalNameTy())}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); - RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple"); + FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2"); break; } case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { @@ -2020,13 +2045,14 @@ llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( llvm::SmallVector LastPrivatesReductions; llvm::SmallDenseMap MappedDeclsFields; // Globalize team reductions variable unconditionally in all modes. - getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); + if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD) + getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions); if (!LastPrivatesReductions.empty()) { GlobalizedRD = ::buildRecordForGlobalizedVars( CGM.getContext(), llvm::None, LastPrivatesReductions, - MappedDeclsFields); + MappedDeclsFields, WarpSize); } } else if (!LastPrivatesReductions.empty()) { assert(!TeamAndReductions.first && @@ -3046,18 +3072,31 @@ static void emitReductionListCopy( shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(), RemoteLaneOffset, Private->getExprLoc()); } else { - if (Private->getType()->isScalarType()) { + switch (CGF.getEvaluationKind(Private->getType())) { + case TEK_Scalar: { llvm::Value *Elem = CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, Private->getType(), Private->getExprLoc()); // Store the source element value to the dest element address. 
CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, Private->getType()); - } else { + break; + } + case TEK_Complex: { + CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex( + CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), + Private->getExprLoc()); + CGF.EmitStoreOfComplex( + Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()), + /*isInit=*/false); + break; + } + case TEK_Aggregate: CGF.EmitAggregateCopy( CGF.MakeAddrLValue(DestElementAddr, Private->getType()), CGF.MakeAddrLValue(SrcElementAddr, Private->getType()), Private->getType(), AggValueSlot::DoesNotOverlap); + break; } } @@ -3141,9 +3180,9 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, const CGFunctionInfo &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); - auto *Fn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - "_omp_reduction_inter_warp_copy_func", &CGM.getModule()); + auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI), + llvm::GlobalValue::InternalLinkage, + "_omp_reduction_inter_warp_copy_func", &M); CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); Fn->setDoesNotRecurse(); CodeGenFunction CGF(CGM); @@ -3560,6 +3599,406 @@ static llvm::Function *emitShuffleAndReduceFunction( return Fn; } +/// This function emits a helper that copies all the reduction variables from +/// the team into the provided global buffer for the reduction variables. +/// +/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) +/// For all data entries D in reduce_data: +/// Copy local D to buffer.D[Idx] +static llvm::Value *emitListToGlobalCopyFunction( + CodeGenModule &CGM, ArrayRef Privates, + QualType ReductionArrayTy, SourceLocation Loc, + const RecordDecl *TeamReductionRec, + const llvm::SmallDenseMap + &VarFieldMap) { + ASTContext &C = CGM.getContext(); + + // Buffer: global reduction buffer. + ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + // Idx: index of the buffer. + ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, + ImplicitParamDecl::Other); + // ReduceList: thread local Reduce list. 
+ ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + FunctionArgList Args; + Args.push_back(&BufferArg); + Args.push_back(&IdxArg); + Args.push_back(&ReduceListArg); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_list_to_global_copy_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); + Fn->setDoesNotRecurse(); + CodeGenFunction CGF(CGM); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); + + CGBuilderTy &Bld = CGF.Builder; + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, Loc), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + QualType StaticTy = C.getRecordType(TeamReductionRec); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), + LLVMReductionsBufferTy->getPointerTo()); + llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), + CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + /*Volatile=*/false, C.IntTy, + Loc)}; + unsigned Idx = 0; + for (const Expr *Private : Privates) { + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = ((CopyType*)(elemptrptr)) + I + ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); + Address ElemPtr = + Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + const ValueDecl *VD = cast(Private)->getDecl(); + // Global = Buffer.VD[Idx]; + const FieldDecl *FD = VarFieldMap.lookup(VD); + LValue GlobLVal = CGF.EmitLValueForField( + CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); + switch (CGF.getEvaluationKind(Private->getType())) { + case TEK_Scalar: { + llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, + Private->getType(), Loc); + CGF.EmitStoreOfScalar(V, GlobLVal); + break; + } + case TEK_Complex: { + CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex( + CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc); + CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false); + break; + } + case TEK_Aggregate: + CGF.EmitAggregateCopy(GlobLVal, + CGF.MakeAddrLValue(ElemPtr, Private->getType()), + Private->getType(), AggValueSlot::DoesNotOverlap); + break; + } + ++Idx; + } + + CGF.FinishFunction(); + return Fn; +} + +/// This function emits a helper that reduces all the reduction variables from +/// the team into the provided global buffer for the reduction variables. +/// +/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data) +/// void *GlobPtrs[]; +/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; +/// ... 
+/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; +/// reduce_function(GlobPtrs, reduce_data); +static llvm::Value *emitListToGlobalReduceFunction( + CodeGenModule &CGM, ArrayRef Privates, + QualType ReductionArrayTy, SourceLocation Loc, + const RecordDecl *TeamReductionRec, + const llvm::SmallDenseMap + &VarFieldMap, + llvm::Function *ReduceFn) { + ASTContext &C = CGM.getContext(); + + // Buffer: global reduction buffer. + ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + // Idx: index of the buffer. + ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, + ImplicitParamDecl::Other); + // ReduceList: thread local Reduce list. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + FunctionArgList Args; + Args.push_back(&BufferArg); + Args.push_back(&IdxArg); + Args.push_back(&ReduceListArg); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_list_to_global_reduce_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); + Fn->setDoesNotRecurse(); + CodeGenFunction CGF(CGM); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); + + CGBuilderTy &Bld = CGF.Builder; + + Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + QualType StaticTy = C.getRecordType(TeamReductionRec); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), + LLVMReductionsBufferTy->getPointerTo()); + + // 1. Build a list of reduction variables. + // void *RedList[] = {[0], ..., [-1]}; + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), + CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + /*Volatile=*/false, C.IntTy, + Loc)}; + unsigned Idx = 0; + for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + // Global = Buffer.VD[Idx]; + const ValueDecl *VD = cast(*IPriv)->getDecl(); + const FieldDecl *FD = VarFieldMap.lookup(VD); + LValue GlobLVal = CGF.EmitLValueForField( + CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); + CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. 
+ ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .NumElts, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); + } + } + + // Call reduce_function(GlobalReduceList, ReduceList) + llvm::Value *GlobalReduceList = + CGF.EmitCastToVoidPtr(ReductionList.getPointer()); + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( + AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); + CGM.getOpenMPRuntime().emitOutlinedFunctionCall( + CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr}); + CGF.FinishFunction(); + return Fn; +} + +/// This function emits a helper that copies all the reduction variables from +/// the team into the provided global buffer for the reduction variables. +/// +/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data) +/// For all data entries D in reduce_data: +/// Copy buffer.D[Idx] to local D; +static llvm::Value *emitGlobalToListCopyFunction( + CodeGenModule &CGM, ArrayRef Privates, + QualType ReductionArrayTy, SourceLocation Loc, + const RecordDecl *TeamReductionRec, + const llvm::SmallDenseMap + &VarFieldMap) { + ASTContext &C = CGM.getContext(); + + // Buffer: global reduction buffer. + ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + // Idx: index of the buffer. + ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, + ImplicitParamDecl::Other); + // ReduceList: thread local Reduce list. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + FunctionArgList Args; + Args.push_back(&BufferArg); + Args.push_back(&IdxArg); + Args.push_back(&ReduceListArg); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_global_to_list_copy_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); + Fn->setDoesNotRecurse(); + CodeGenFunction CGF(CGM); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); + + CGBuilderTy &Bld = CGF.Builder; + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, Loc), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + QualType StaticTy = C.getRecordType(TeamReductionRec); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), + LLVMReductionsBufferTy->getPointerTo()); + + llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), + CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + /*Volatile=*/false, C.IntTy, + Loc)}; + unsigned Idx = 0; + for (const Expr *Private : Privates) { + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + 
ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = ((CopyType*)(elemptrptr)) + I + ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo()); + Address ElemPtr = + Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + const ValueDecl *VD = cast(Private)->getDecl(); + // Global = Buffer.VD[Idx]; + const FieldDecl *FD = VarFieldMap.lookup(VD); + LValue GlobLVal = CGF.EmitLValueForField( + CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); + switch (CGF.getEvaluationKind(Private->getType())) { + case TEK_Scalar: { + llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc); + CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType()); + break; + } + case TEK_Complex: { + CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc); + CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()), + /*isInit=*/false); + break; + } + case TEK_Aggregate: + CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()), + GlobLVal, Private->getType(), + AggValueSlot::DoesNotOverlap); + break; + } + ++Idx; + } + + CGF.FinishFunction(); + return Fn; +} + +/// This function emits a helper that reduces all the reduction variables from +/// the team into the provided global buffer for the reduction variables. +/// +/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) +/// void *GlobPtrs[]; +/// GlobPtrs[0] = (void*)&buffer.D0[Idx]; +/// ... +/// GlobPtrs[N] = (void*)&buffer.DN[Idx]; +/// reduce_function(reduce_data, GlobPtrs); +static llvm::Value *emitGlobalToListReduceFunction( + CodeGenModule &CGM, ArrayRef Privates, + QualType ReductionArrayTy, SourceLocation Loc, + const RecordDecl *TeamReductionRec, + const llvm::SmallDenseMap + &VarFieldMap, + llvm::Function *ReduceFn) { + ASTContext &C = CGM.getContext(); + + // Buffer: global reduction buffer. + ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + // Idx: index of the buffer. + ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy, + ImplicitParamDecl::Other); + // ReduceList: thread local Reduce list. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, + C.VoidPtrTy, ImplicitParamDecl::Other); + FunctionArgList Args; + Args.push_back(&BufferArg); + Args.push_back(&IdxArg); + Args.push_back(&ReduceListArg); + + const CGFunctionInfo &CGFI = + CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_global_to_list_reduce_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); + Fn->setDoesNotRecurse(); + CodeGenFunction CGF(CGM); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc); + + CGBuilderTy &Bld = CGF.Builder; + + Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg); + QualType StaticTy = C.getRecordType(TeamReductionRec); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc), + LLVMReductionsBufferTy->getPointerTo()); + + // 1. 
Build a list of reduction variables. + // void *RedList[] = {[0], ..., [-1]}; + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty), + CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg), + /*Volatile=*/false, C.IntTy, + Loc)}; + unsigned Idx = 0; + for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + // Global = Buffer.VD[Idx]; + const ValueDecl *VD = cast(*IPriv)->getDecl(); + const FieldDecl *FD = VarFieldMap.lookup(VD); + LValue GlobLVal = CGF.EmitLValueForField( + CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); + llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); + CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. + ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .NumElts, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); + } + } + + // Call reduce_function(ReduceList, GlobalReduceList) + llvm::Value *GlobalReduceList = + CGF.EmitCastToVoidPtr(ReductionList.getPointer()); + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar( + AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc); + CGM.getOpenMPRuntime().emitOutlinedFunctionCall( + CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList}); + CGF.FinishFunction(); + return Fn; +} + /// /// Design of OpenMP reductions on the GPU /// @@ -3833,55 +4272,55 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *ThreadId = getThreadID(CGF, Loc); llvm::Value *Res; - if (ParallelReduction) { - ASTContext &C = CGM.getContext(); - // 1. Build a list of reduction variables. - // void *RedList[] = {[0], ..., [-1]}; - auto Size = RHSExprs.size(); - for (const Expr *E : Privates) { - if (E->getType()->isVariablyModifiedType()) - // Reserve place for array size. - ++Size; - } - llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); - QualType ReductionArrayTy = - C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, - /*IndexTypeQuals=*/0); - Address ReductionList = - CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); - auto IPriv = Privates.begin(); - unsigned Idx = 0; - for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { - Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - CGF.Builder.CreateStore( - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), - Elem); - if ((*IPriv)->getType()->isVariablyModifiedType()) { - // Store array size. - ++Idx; - Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); - llvm::Value *Size = CGF.Builder.CreateIntCast( - CGF.getVLASize( - CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) - .NumElts, - CGF.SizeTy, /*isSigned=*/false); - CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), - Elem); - } + ASTContext &C = CGM.getContext(); + // 1. Build a list of reduction variables. 
+ // void *RedList[] = {[0], ..., [-1]}; + auto Size = RHSExprs.size(); + for (const Expr *E : Privates) { + if (E->getType()->isVariablyModifiedType()) + // Reserve place for array size. + ++Size; + } + llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); + QualType ReductionArrayTy = + C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + unsigned Idx = 0; + for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + CGF.Builder.CreateStore( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + Elem); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. + ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .NumElts, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); } + } - llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); - llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - ReductionList.getPointer(), CGF.VoidPtrTy); - llvm::Function *ReductionFn = emitReductionFunction( - CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), - Privates, LHSExprs, RHSExprs, ReductionOps); - llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( - CGM, Privates, ReductionArrayTy, ReductionFn, Loc); - llvm::Value *InterWarpCopyFn = - emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); + llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + ReductionList.getPointer(), CGF.VoidPtrTy); + llvm::Function *ReductionFn = emitReductionFunction( + CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), + Privates, LHSExprs, RHSExprs, ReductionOps); + llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); + llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction( + CGM, Privates, ReductionArrayTy, ReductionFn, Loc); + llvm::Value *InterWarpCopyFn = + emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); + if (ParallelReduction) { llvm::Value *Args[] = {RTLoc, ThreadId, CGF.Builder.getInt32(RHSExprs.size()), @@ -3890,17 +4329,59 @@ void CGOpenMPRuntimeNVPTX::emitReduction( ShuffleAndReduceFn, InterWarpCopyFn}; - Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2), - Args); + Res = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2), + Args); } else { assert(TeamsReduction && "expected teams reduction."); - std::string Name = getName({"reduction"}); - llvm::Value *Lock = getCriticalRegionLock(Name); - llvm::Value *Args[] = {RTLoc, ThreadId, Lock}; + llvm::SmallDenseMap VarFieldMap; + llvm::SmallVector PrivatesReductions(Privates.size()); + int Cnt = 0; + for (const Expr *DRE : Privates) { + PrivatesReductions[Cnt] = cast(DRE)->getDecl(); + ++Cnt; + } + const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars( + CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap, + C.getLangOpts().OpenMPCUDAReductionBufNum); + TeamsReductions.push_back(TeamReductionRec); + if (!KernelTeamsReductionPtr) { + KernelTeamsReductionPtr 
= new llvm::GlobalVariable( + CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true, + llvm::GlobalValue::InternalLinkage, nullptr, + "_openmp_teams_reductions_buffer_$_$ptr"); + } + llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar( + Address(KernelTeamsReductionPtr, CGM.getPointerAlign()), + /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc); + llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction( + CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); + llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction( + CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, + ReductionFn); + llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction( + CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap); + llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction( + CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap, + ReductionFn); + + llvm::Value *Args[] = { + RTLoc, + ThreadId, + GlobalBufferPtr, + CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum), + RL, + ShuffleAndReduceFn, + InterWarpCopyFn, + GlobalToBufferCpyFn, + GlobalToBufferRedFn, + BufferToGlobalCpyFn, + BufferToGlobalRedFn}; + Res = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple), + OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2), Args); } @@ -3931,30 +4412,14 @@ void CGOpenMPRuntimeNVPTX::emitReduction( ++IRHS; } }; - if (ParallelReduction) { - llvm::Value *EndArgs[] = {ThreadId}; - RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, llvm::None, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), - EndArgs); - RCG.setAction(Action); - RCG(CGF); - } else { - assert(TeamsReduction && "expected teams reduction."); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - std::string Name = getName({"reduction"}); - llvm::Value *Lock = getCriticalRegionLock(Name); - llvm::Value *EndArgs[] = {RTLoc, ThreadId, Lock}; - RegionCodeGenTy RCG(CodeGen); - NVPTXActionTy Action( - nullptr, llvm::None, - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple), - EndArgs); - RCG.setAction(Action); - RCG(CGF); - } + llvm::Value *EndArgs[] = {ThreadId}; + RegionCodeGenTy RCG(CodeGen); + NVPTXActionTy Action( + nullptr, llvm::None, + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), + EndArgs); + RCG.setAction(Action); + RCG(CGF); // There is no need to emit line number for unconditional branch. 
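  // Editorial sketch, not part of the patch: for a teams reduction over a
  // single double 'e', the call sequence emitted above boils down to roughly
  // the following pseudo-code (variable and helper names are illustrative,
  // modeled on the names created earlier in this file):
  //
  //   void *red_list[1] = {&e};
  //   int32_t res = __kmpc_nvptx_teams_reduce_nowait_v2(
  //       &loc, gtid, (void *)&_openmp_teams_reductions_buffer_$_,
  //       /*num_of_records=*/1024, red_list,
  //       shuffle_and_reduce_func, inter_warp_copy_func,
  //       _omp_reduction_list_to_global_copy_func,
  //       _omp_reduction_list_to_global_reduce_func,
  //       _omp_reduction_global_to_list_copy_func,
  //       _omp_reduction_global_to_list_reduce_func);
  //   if (res == 1) {
  //     // combine the reduced value into the original variable, then:
  //     __kmpc_nvptx_end_reduce_nowait(gtid);
  //   }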
(void)ApplyDebugLocation::CreateEmpty(CGF); CGF.EmitBlock(ExitBB, /*IsFinished=*/true); @@ -4600,5 +5065,33 @@ void CGOpenMPRuntimeNVPTX::clear() { } } } + if (!TeamsReductions.empty()) { + ASTContext &C = CGM.getContext(); + RecordDecl *StaticRD = C.buildImplicitRecord( + "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union); + StaticRD->startDefinition(); + for (const RecordDecl *TeamReductionRec : TeamsReductions) { + QualType RecTy = C.getRecordType(TeamReductionRec); + auto *Field = FieldDecl::Create( + C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy, + C.getTrivialTypeSourceInfo(RecTy, SourceLocation()), + /*BW=*/nullptr, /*Mutable=*/false, + /*InitStyle=*/ICIS_NoInit); + Field->setAccess(AS_public); + StaticRD->addDecl(Field); + } + StaticRD->completeDefinition(); + QualType StaticTy = C.getRecordType(StaticRD); + llvm::Type *LLVMReductionsBufferTy = + CGM.getTypes().ConvertTypeForMem(StaticTy); + auto *GV = new llvm::GlobalVariable( + CGM.getModule(), LLVMReductionsBufferTy, + /*isConstant=*/false, llvm::GlobalValue::CommonLinkage, + llvm::Constant::getNullValue(LLVMReductionsBufferTy), + "_openmp_teams_reductions_buffer_$_"); + KernelTeamsReductionPtr->setInitializer( + llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, + CGM.VoidPtrTy)); + } CGOpenMPRuntime::clear(); } diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index 644724b3e6..cc66c4659e 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -461,6 +461,12 @@ private: unsigned RegionCounter = 0; }; llvm::SmallVector GlobalizedRecords; + llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; + /// List of the records with the list of fields for the reductions across the + /// teams. Used to build the intermediate buffer for the fast teams + /// reductions. + /// All the records are gathered into a union `union.type` is created. + llvm::SmallVector TeamsReductions; /// Shared pointer for the global memory in the global memory buffer used for /// the given kernel. 
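/// Editorial sketch, not part of the patch: for a kernel whose teams reduction
/// covers a single 'double' and the default of 1024 records, the globals that
/// clear() emits above are morally equivalent to:
///
///   struct team1_rec { double e[1024]; };   // one row per reduction record
///   union _openmp_teams_reduction_type_$_ { team1_rec r; };
///   _openmp_teams_reduction_type_$_ _openmp_teams_reductions_buffer_$_;
///   void *_openmp_teams_reductions_buffer_$_$ptr =
///       (void *)&_openmp_teams_reductions_buffer_$_;
///
/// 'team1_rec' and its field name are made up for the illustration; the union
/// gets one member per kernel that performs a teams reduction.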
llvm::GlobalVariable *KernelStaticGlobalized = nullptr; diff --git a/lib/Driver/ToolChains/Clang.cpp b/lib/Driver/ToolChains/Clang.cpp index 0057e5391b..e7fb3fe403 100644 --- a/lib/Driver/ToolChains/Clang.cpp +++ b/lib/Driver/ToolChains/Clang.cpp @@ -4464,6 +4464,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_version_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ); Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ); + Args.AddAllArgs(CmdArgs, + options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ); if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse, options::OPT_fno_openmp_optimistic_collapse, /*Default=*/false)) diff --git a/lib/Frontend/CompilerInvocation.cpp b/lib/Frontend/CompilerInvocation.cpp index d4094ac5ae..1a33a00004 100644 --- a/lib/Frontend/CompilerInvocation.cpp +++ b/lib/Frontend/CompilerInvocation.cpp @@ -2840,6 +2840,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, Opts.OpenMPCUDABlocksPerSM = getLastArgIntValue(Args, options::OPT_fopenmp_cuda_blocks_per_sm_EQ, Opts.OpenMPCUDABlocksPerSM, Diags); + Opts.OpenMPCUDAReductionBufNum = getLastArgIntValue( + Args, options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ, + Opts.OpenMPCUDAReductionBufNum, Diags); } // Prevent auto-widening the representation of loop counters during an diff --git a/test/Driver/openmp-offload-gpu.c b/test/Driver/openmp-offload-gpu.c index dfdc79b5f7..7a4dd95e54 100644 --- a/test/Driver/openmp-offload-gpu.c +++ b/test/Driver/openmp-offload-gpu.c @@ -273,3 +273,8 @@ // RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-force-full-runtime -fno-openmp-cuda-force-full-runtime 2>&1 \ // RUN: | FileCheck -check-prefix=NO_FULL_RUNTIME %s // NO_FULL_RUNTIME-NOT: "-{{fno-|f}}openmp-cuda-force-full-runtime" + +// RUN: %clang -### -no-canonical-prefixes -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-teams-reduction-recs-num=2048 2>&1 \ +// RUN: | FileCheck -check-prefix=CUDA_RED_RECS %s +// CUDA_RED_RECS: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda" +// CUDA_RED_RECS-SAME: "-fopenmp-cuda-teams-reduction-recs-num=2048" diff --git a/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 0de25295a7..d5a275a554 100644 --- a/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -3,28 +3,32 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp 
-fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER -// CHECK: [[MAP_TY:%.+]] = type { [128 x i8] } +// CHECK-DAG: [[TEAM1_REDUCE_TY:%.+]] = type { [{{1024|2048}} x double] } +// CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] } +// CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] } +// CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] } +// CHECK-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 // CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 -// CHECK-DAG: [[KERNEL_SHARED3:@.+]] = internal unnamed_addr constant i16 1 // CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} // CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 -// CHECK-DAG: [[KERNEL_SIZE3:@.+]] = internal unnamed_addr constant i{{64|32}} 8 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. -// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 + +// CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = common global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer template tx ftemplate(int n) { @@ -66,9 +70,9 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l37}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l41}}_worker() - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l37]]( + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l41]]( // // CHECK: {{call|invoke}} void [[T1]]_worker() // @@ -78,7 +82,11 @@ int bar(int n){ // CHECK: [[EV:%.+]] = load double, double* [[E]], align // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 // CHECK: store double [[ADD]], double* [[E]], align - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]]) + // CHECK: [[GEP1:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[BC:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[BC]], i8** [[GEP1]], + // CHECK: [[BC_RED_LIST:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void 
(i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -87,15 +95,250 @@ int bar(int n){ // CHECK: [[EV:%.+]] = load double, double* [[E]], align // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] // CHECK: store double [[ADD]], double* [[E_IN]], align - // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]]) + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] // CHECK: call void @__kmpc_kernel_deinit( - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker() + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [1 x i8*], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64* + // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64* + // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult 
i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [1 x i8*]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [1 x i8*]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [1 x i8*]* + // CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], + // CHECK: br label + // CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], + // CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 + // CHECK: br i1 [[DONE_COPY]], label + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] + // + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], + // 
CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 + // CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], + // CHECK: br label + // CHECK: ret + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[LOC_RED1:%.+]] = load double, double* [[RL_RED1]], + // CHECK: store double [[LOC_RED1]], double* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: ret void + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // 
CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [1 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to double* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1:%.+]] = load double, double* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: store double [[GLOBAL_RED1]], double* [[RL_RED1]], + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [1 x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM1_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM1_REDUCE_TY]], [[TEAM1_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x double], [{{1024|2048}} x double]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast double* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [1 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // 
CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) + // CHECK: ret void + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}_worker() - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l43]]( + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l47]]( // // CHECK: {{call|invoke}} void [[T2]]_worker() @@ -111,7 +354,13 @@ int bar(int n){ // CHECK: [[DV:%.+]] = load float, float* [[D]], align // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} // CHECK: store float [[MUL]], float* [[D]], align - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]]) + // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: store i8* [[C]], i8** [[GEP1]], + // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[BC:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[BC]], i8** [[GEP2]], + // CHECK: [[BC_RED_LIST:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -127,25 +376,349 @@ int bar(int n){ // CHECK: [[DV:%.+]] = load float, float* [[D]], align // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] // CHECK: store float [[MUL]], float* [[D_IN]], align - // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]]) + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] // CHECK: call void @__kmpc_kernel_deinit( - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}( + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 
[[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32* + // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: 
[[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x i8*]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [2 x i8*]* + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // CHECK: store 
volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* + // + // Read into warp 0. 
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: ret + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[LOC_RED1:%.+]] = load i8, i8* [[RL_RED1]], + // CHECK: store i8 [[LOC_RED1]], i8* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[LOC_RED1:%.+]] = load float, float* [[RL_RED1]], + // CHECK: store float [[LOC_RED1]], float* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: ret void + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: 
[[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1:%.+]] = load i8, i8* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: store i8 [[GLOBAL_RED1]], i8* [[RL_RED1]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to float* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1:%.+]] = load float, float* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: store float [[GLOBAL_RED1]], float* [[RL_RED1]], + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [2 
x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM2_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i8], [{{1024|2048}} x i8]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM2_REDUCE_TY]], [[TEAM2_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x float], [{{1024|2048}} x float]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast float* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) + // CHECK: ret void + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l54}}( // // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() - // CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**)) - // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], - // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]* - // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0 - // CHECK-DAG: [[B_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 1 - // CHECK: store i32 0, i32* [[A_ADDR]], - // CHECK: store i16 -32768, i16* [[B_ADDR]], + // CHECK-NOT: call void @__kmpc_get_team_static_memory + // CHECK: store i32 0, + // CHECK: store i32 0, i32* [[A_ADDR:%.+]], align + // CHECK: store i16 -32768, i16* [[B_ADDR:%.+]], align // CHECK: call void [[OUTLINED:@.+]](i32* {{.+}}, i32* {{.+}}, i32* [[A_ADDR]], i16* [[B_ADDR]]) - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_simple(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], [8 x i32]* [[LOCK:@.+]]) + // CHECK: [[GEP1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[BC:%.+]] = bitcast i32* [[A_ADDR]] to i8* + // CHECK: store i8* [[BC]], i8** [[GEP1]], + // CHECK: [[GEP2:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[BC:%.+]] = bitcast i16* [[B_ADDR]] to i8* + // CHECK: store i8* [[BC]], i8** [[GEP2]], + // CHECK: [[BC_RED_LIST:%.+]] 
= bitcast [2 x i8*]* [[RED_LIST]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* [[LOC:@.+]], i32 [[GTID:%.+]], i8* bitcast ([[TEAMS_REDUCE_UNION_TY]]* [[TEAMS_RED_BUFFER]] to i8*), i32 {{1024|2048}}, i8* [[BC_RED_LIST]], void (i8*, i16, i16, i16)* [[SHUFFLE_AND_REDUCE:@.+]], void (i8*, i32)* [[INTER_WARP_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_COPY:@.+]], void (i8*, i32, i8*)* [[RED_LIST_TO_GLOBAL_RED:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_COPY:@.+]], void (i8*, i32, i8*)* [[GLOBAL_TO_RED_LIST_RED:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // @@ -172,11 +745,10 @@ int bar(int n){ // CHECK: [[MAX_CONT]] // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align - // CHECK: call void @__kmpc_nvptx_teams_end_reduce_nowait_simple(%struct.ident_t* [[LOC]], i32 [[GTID]], [8 x i32]* [[LOCK]]) + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[GTID]]) // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] - // CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}}) @@ -475,4 +1047,346 @@ int bar(int n){ // CHECK: [[READ_CONT]] // CHECK: ret + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* 
[[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_AND_REDUCE]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [2 x i8*], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [2 x 
i8*]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [2 x i8*]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[INTER_WARP_COPY]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. 
+ // CHECK: [[COPY_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: call void @__kmpc_barrier(%struct.ident_t* @ + // CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* + // + // Read into warp 0. 
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: ret + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[LOC_RED1:%.+]] = load i32, i32* [[RL_RED1]], + // CHECK: store i32 [[LOC_RED1]], i32* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[LOC_RED1:%.+]] = load i16, i16* [[RL_RED1]], + // CHECK: store i16 [[LOC_RED1]], i16* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: ret void + + // CHECK: define internal void [[RED_LIST_TO_GLOBAL_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load 
i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]]) + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_COPY]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: [[RL:%.+]] = bitcast i8* [[RL_BC]] to [2 x i8*]* + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i32* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1:%.+]] = load i32, i32* [[GLOBAL_RED1_IDX_PTR]], + // CHECK: store i32 [[GLOBAL_RED1]], i32* [[RL_RED1]], + // CHECK: [[RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[RL_RED1_BC:%.+]] = load i8*, i8** [[RL_RED1_PTR]], + // CHECK: [[RL_RED1:%.+]] = bitcast i8* [[RL_RED1_BC]] to i16* + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1:%.+]] = load i16, i16* [[GLOBAL_RED1_IDX_PTR]], + // 
CHECK: store i16 [[GLOBAL_RED1]], i16* [[RL_RED1]], + // CHECK: ret void + + // CHECK: define internal void [[GLOBAL_TO_RED_LIST_RED]](i8*, i32, i8*) + // CHECK: [[GLOBAL_PTR:%.+]] = alloca i8*, + // CHECK: [[IDX_PTR:%.+]] = alloca i32, + // CHECK: [[RL_PTR:%.+]] = alloca i8*, + // CHECK: [[LOCAL_RL:%.+]] = alloca [2 x i8*], + // CHECK: store i8* %{{.+}}, i8** [[GLOBAL_PTR]], + // CHECK: store i32 %{{.+}}, i32* [[IDX_PTR]], + // CHECK: store i8* %{{.+}}, i8** [[RL_PTR]], + // CHECK: [[GLOBAL_BC:%.+]] = load i8*, i8** [[GLOBAL_PTR]], + // CHECK: [[GLOBAL:%.+]] = bitcast i8* [[GLOBAL_BC]] to [[TEAM3_REDUCE_TY]]* + // CHECK: [[IDX:%.+]] = load i32, i32* [[IDX_PTR]], + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i32], [{{1024|2048}} x i32]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i32* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_RED1_PTR:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[LOCAL_RL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_PTR:%.+]] = getelementptr inbounds [[TEAM3_REDUCE_TY]], [[TEAM3_REDUCE_TY]]* [[GLOBAL]], i{{[0-9]+}} 0, i{{[0-9]+}} 1 + // CHECK: [[GLOBAL_RED1_IDX_PTR:%.+]] = getelementptr inbounds [{{1024|2048}} x i16], [{{1024|2048}} x i16]* [[GLOBAL_RED1_PTR]], i{{[0-9]+}} 0, i32 [[IDX]] + // CHECK: [[GLOBAL_RED1_IDX_PTR_BC:%.+]] = bitcast i16* [[GLOBAL_RED1_IDX_PTR]] to i8* + // CHECK: store i8* [[GLOBAL_RED1_IDX_PTR_BC]], i8** [[LOCAL_RL_RED1_PTR]] + // CHECK: [[LOCAL_RL_BC:%.+]] = bitcast [2 x i8*]* [[LOCAL_RL]] to i8* + // CHECK: [[RL_BC:%.+]] = load i8*, i8** [[RL_PTR]], + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) + // CHECK: ret void + #endif
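
The CHECK lines above verify the four global-buffer callbacks that the codegen now passes to __kmpc_nvptx_teams_reduce_nowait_v2 (loc, gtid, global_buffer, num_of_records, reduce_list, shuffle-and-reduce fn, inter-warp-copy fn, list-to-global copy, list-to-global reduce, global-to-list copy, global-to-list reduce). The following is only a hand-written C sketch of what those callbacks do for the first test case (a single double reduction variable and a [1024 x double] buffer slot array); it is not the generated IR and not the libomptarget implementation. The record count, the struct and function names, and the '+' combiner are illustrative assumptions.

/* Illustrative sketch only; names and the combiner are hypothetical. */
#define NUM_RECORDS 1024                 /* matches the {{1024|2048}} patterns */

/* Global buffer laid out as a struct of arrays: one array per reduction
   variable, indexed by the slot a team claims. */
typedef struct {
  double e[NUM_RECORDS];
} team_reduce_rec_t;

/* Pairwise reduction over "reduce lists" (arrays of pointers, one entry per
   reduction variable), as checked for [[REDUCTION_FUNC]]. */
static void reduction_func(void *lhs_rl, void *rhs_rl) {
  double *lhs = (double *)((void **)lhs_rl)[0];
  double *rhs = (double *)((void **)rhs_rl)[0];
  *lhs += *rhs;                          /* combiner depends on the reduction clause */
}

/* [[RED_LIST_TO_GLOBAL_COPY]]: store the team-local partial result into slot idx. */
static void list_to_global_copy(void *gbl, int idx, void *rl) {
  team_reduce_rec_t *buf = (team_reduce_rec_t *)gbl;
  buf->e[idx] = *(double *)((void **)rl)[0];
}

/* [[RED_LIST_TO_GLOBAL_RED]]: build a local reduce list that points into the
   global buffer and reuse the ordinary reduction function. */
static void list_to_global_red(void *gbl, int idx, void *rl) {
  team_reduce_rec_t *buf = (team_reduce_rec_t *)gbl;
  void *local_rl[1] = { &buf->e[idx] };
  reduction_func(local_rl, rl);          /* LHS = global slot, RHS = team-local list */
}

/* [[GLOBAL_TO_RED_LIST_COPY]]: read slot idx back into the team-local list. */
static void global_to_list_copy(void *gbl, int idx, void *rl) {
  team_reduce_rec_t *buf = (team_reduce_rec_t *)gbl;
  *(double *)((void **)rl)[0] = buf->e[idx];
}

/* [[GLOBAL_TO_RED_LIST_RED]]: reduce slot idx into the team-local list. */
static void global_to_list_red(void *gbl, int idx, void *rl) {
  team_reduce_rec_t *buf = (team_reduce_rec_t *)gbl;
  void *local_rl[1] = { &buf->e[idx] };
  reduction_func(rl, local_rl);          /* LHS = team-local list, RHS = global slot */
}

The only asymmetry between the two *_RED helpers is the operand order of the final reduction call, which is exactly what the checks distinguish: list-to-global reduces with the global slot as the left-hand side (call void [[REDUCTION_FUNC]](i8* [[LOCAL_RL_BC]], i8* [[RL_BC]])), while global-to-list reduces with the team-local list on the left (call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])).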