From c452e7c5b13885446155f8ed4371a2656b545ac8 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 7 May 2018 14:50:05 +0000
Subject: [PATCH] [OPENMP, NVPTX] Added support for L2 parallelism.

Added initial codegen for level 2, 3, etc. parallelism. Currently, all
parallel regions at the second and deeper levels are run sequentially.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@331642 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGOpenMPRuntime.cpp               |   7 -
 lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp          | 443 ++++++++++++++----
 lib/CodeGen/CGOpenMPRuntimeNVPTX.h            |  48 +-
 lib/CodeGen/CodeGenModule.cpp                 |  13 +-
 test/OpenMP/nvptx_target_codegen.cpp          |  88 +++-
 ...bute_parallel_for_generic_mode_codegen.cpp |  55 +--
 6 files changed, 459 insertions(+), 195 deletions(-)

diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 18b51ea0bc..9e14738dd8 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2764,13 +2764,6 @@ Address CGOpenMPRuntime::getAddrOfArtificialThreadPrivate(CodeGenFunction &CGF,
                          CGM.getPointerAlign());
 }
 
-/// \brief Emits code for OpenMP 'if' clause using specified \a CodeGen
-/// function. Here is the logic:
-/// if (Cond) {
-///   ThenGen();
-/// } else {
-///   ElseGen();
-/// }
 void CGOpenMPRuntime::emitOMPIfClause(CodeGenFunction &CGF, const Expr *Cond,
                                       const RegionCodeGenTy &ThenGen,
                                       const RegionCodeGenTy &ElseGen) {
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 82be31f0f8..9e6f2b4b9a 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -93,6 +93,9 @@ enum OpenMPRTLFunctionNVPTX {
   OMPRTL_NVPTX__kmpc_end_sharing_variables,
   /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
   OMPRTL_NVPTX__kmpc_get_shared_variables,
+  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL_NVPTX__kmpc_parallel_level,
 };
 
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -131,19 +134,17 @@ public:
   }
 };
 
-// A class to track the execution mode when codegening directives within
-// a target region. The appropriate mode (generic/spmd) is set on entry
-// to the target region and used by containing directives such as 'parallel'
-// to emit optimized code.
+/// A class to track the execution mode when codegening directives within
+/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
+/// to the target region and used by containing directives such as 'parallel'
+/// to emit optimized code.
 class ExecutionModeRAII {
 private:
-  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
-  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
+  bool SavedMode;
+  bool &Mode;
 
 public:
-  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
-                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
-      : Mode(Mode) {
+  ExecutionModeRAII(bool &Mode, bool NewMode) : Mode(Mode) {
     SavedMode = Mode;
     Mode = NewMode;
   }
@@ -579,24 +580,171 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
 }
 
 bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
-  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
+  return IsInSPMDExecutionMode;
+}
+
+static CGOpenMPRuntimeNVPTX::DataSharingMode
+getDataSharingMode(CodeGenModule &CGM) {
+  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
+                                          : CGOpenMPRuntimeNVPTX::Generic;
 }
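For orientation, hasNestedSPMDDirective below inspects the single directive
nested inside a target region to decide whether the whole kernel can run in
SPMD mode; parallel regions nested any deeper are what the rest of this patch
serializes. A hypothetical user program (illustration only, not taken from
this patch) with both levels:

    #include <cstdio>

    int main() {
      int a[128] = {0};
    #pragma omp target map(tofrom : a)
    #pragma omp parallel for          // level 1: nested parallel directive,
                                      // so the kernel qualifies for SPMD mode
      for (int i = 0; i < 128; ++i) {
    #pragma omp parallel              // level 2: currently run sequentially
        a[i] = 2;
      }
      printf("a[0] = %d\n", a[0]);    // prints 2
      return 0;
    }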
 
-static CGOpenMPRuntimeNVPTX::ExecutionMode
-getExecutionMode(CodeGenModule &CGM) {
-  return CGM.getLangOpts().OpenMPCUDAMode
-             ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
-             : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
+/// Check for inner (nested) SPMD construct, if any
+static bool hasNestedSPMDDirective(const OMPExecutableDirective &D) {
+  const auto *CS = D.getCapturedStmt(OMPD_target);
+  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+  const Stmt *ChildStmt = nullptr;
+  if (const auto *C = dyn_cast<CompoundStmt>(Body))
+    if (C->size() == 1)
+      ChildStmt = C->body_front();
+  if (!ChildStmt)
+    return false;
+
+  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
+    // TODO: add further analysis for inner teams|distribute directives, if any.
+    switch (D.getDirectiveKind()) {
+    case OMPD_target:
+      return (isOpenMPParallelDirective(DKind) &&
+              !isOpenMPTeamsDirective(DKind) &&
+              !isOpenMPDistributeDirective(DKind)) ||
+             isOpenMPSimdDirective(DKind) ||
+             DKind == OMPD_teams_distribute_parallel_for;
+    case OMPD_target_teams:
+      return (isOpenMPParallelDirective(DKind) &&
+              !isOpenMPDistributeDirective(DKind)) ||
+             isOpenMPSimdDirective(DKind) ||
+             DKind == OMPD_distribute_parallel_for;
+    case OMPD_target_teams_distribute:
+      return isOpenMPParallelDirective(DKind) || isOpenMPSimdDirective(DKind);
+    case OMPD_target_simd:
+    case OMPD_target_parallel:
+    case OMPD_target_parallel_for:
+    case OMPD_target_parallel_for_simd:
+    case OMPD_target_teams_distribute_simd:
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
+    case OMPD_parallel:
+    case OMPD_for:
+    case OMPD_parallel_for:
+    case OMPD_parallel_sections:
+    case OMPD_for_simd:
+    case OMPD_parallel_for_simd:
+    case OMPD_cancel:
+    case OMPD_cancellation_point:
+    case OMPD_ordered:
+    case OMPD_threadprivate:
+    case OMPD_task:
+    case OMPD_simd:
+    case OMPD_sections:
+    case OMPD_section:
+    case OMPD_single:
+    case OMPD_master:
+    case OMPD_critical:
+    case OMPD_taskyield:
+    case OMPD_barrier:
+    case OMPD_taskwait:
+    case OMPD_taskgroup:
+    case OMPD_atomic:
+    case OMPD_flush:
+    case OMPD_teams:
+    case OMPD_target_data:
+    case OMPD_target_exit_data:
+    case OMPD_target_enter_data:
+    case OMPD_distribute:
+    case OMPD_distribute_simd:
+    case OMPD_distribute_parallel_for:
+    case OMPD_distribute_parallel_for_simd:
+    case OMPD_teams_distribute:
+    case OMPD_teams_distribute_simd:
+    case OMPD_teams_distribute_parallel_for:
+    case OMPD_teams_distribute_parallel_for_simd:
+    case OMPD_target_update:
+    case OMPD_declare_simd:
+    case OMPD_declare_target:
+    case OMPD_end_declare_target:
+    case OMPD_declare_reduction:
+    case OMPD_taskloop:
+    case OMPD_taskloop_simd:
+    case OMPD_unknown:
+      llvm_unreachable("Unexpected directive.");
+    }
+  }
+
+  return false;
+}
+
+static bool supportsSPMDExecutionMode(const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target:
+  case OMPD_target_teams:
+  case OMPD_target_teams_distribute:
+    return hasNestedSPMDDirective(D);
+  case OMPD_target_simd:
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
+  case OMPD_target_parallel_for_simd:
+  case OMPD_target_teams_distribute_simd:
+  case OMPD_target_teams_distribute_parallel_for:
+  case OMPD_target_teams_distribute_parallel_for_simd:
+    return true;
+  case OMPD_parallel:
+  case OMPD_for:
+  case OMPD_parallel_for:
+  case OMPD_parallel_sections:
+  case OMPD_for_simd:
+  case OMPD_parallel_for_simd:
+  case OMPD_cancel:
+  case OMPD_cancellation_point:
+  case OMPD_ordered:
+  case OMPD_threadprivate:
+  case OMPD_task:
+  case OMPD_simd:
+  case OMPD_sections:
+  case OMPD_section:
+  case OMPD_single:
+  case OMPD_master:
+  case OMPD_critical:
+  case OMPD_taskyield:
+  case OMPD_barrier:
+  case OMPD_taskwait:
+  case OMPD_taskgroup:
+  case OMPD_atomic:
+  case OMPD_flush:
+  case OMPD_teams:
+  case OMPD_target_data:
+  case OMPD_target_exit_data:
+  case OMPD_target_enter_data:
+  case OMPD_distribute:
+  case OMPD_distribute_simd:
+  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_teams_distribute:
+  case OMPD_teams_distribute_simd:
+  case OMPD_teams_distribute_parallel_for:
+  case OMPD_teams_distribute_parallel_for_simd:
+  case OMPD_target_update:
+  case OMPD_declare_simd:
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
+  case OMPD_declare_reduction:
+  case OMPD_taskloop:
+  case OMPD_taskloop_simd:
+  case OMPD_unknown:
+    break;
+  }
+  llvm_unreachable(
+      "Unknown programming model for OpenMP directive on NVPTX target.");
+}
 
-void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                              StringRef ParentName,
                                              llvm::Function *&OutlinedFn,
                                              llvm::Constant *&OutlinedFnID,
                                              bool IsOffloadEntry,
                                              const RegionCodeGenTy &CodeGen) {
-  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
-                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
+  ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/false);
   EntryFunctionState EST;
   WorkerFunctionState WST(CGM, D.getLocStart());
   Work.clear();
@@ -613,11 +761,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
         : EST(EST), WST(WST) {}
     void Enter(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
-          .emitGenericEntryHeader(CGF, EST, WST);
+          .emitNonSPMDEntryHeader(CGF, EST, WST);
     }
     void Exit(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
-          .emitGenericEntryFooter(CGF, EST);
+          .emitNonSPMDEntryFooter(CGF, EST);
     }
   } Action(EST, WST);
   CodeGen.setAction(Action);
@@ -633,7 +781,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
 }
 
 // Setup NVPTX threads for master-worker OpenMP scheme.
-void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
+void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                   EntryFunctionState &EST,
                                                   WorkerFunctionState &WST) {
   CGBuilderTy &Bld = CGF.Builder;
@@ -657,6 +805,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
 
   CGF.EmitBlock(MasterBB);
+  IsInTargetMasterThreadRegion = true;
   // SEQUENTIAL (MASTER) REGION START
   // First action in sequential region:
   // Initialize the state of the OpenMP runtime library on the GPU.
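The hunks that follow thread a parallelism level through the generated code:
the worker wrapper now takes the level as its first argument, and the new
__kmpc_parallel_level call queries it at run time. The user-visible
counterpart of this value is omp_get_level(), which counts enclosing parallel
regions whether or not they are serialized; a small host-only illustration
(an assumed example, not from the patch):

    #include <cstdio>
    #include <omp.h>

    int main() {
      printf("outside: %d\n", omp_get_level());     // 0
    #pragma omp parallel num_threads(2)
    #pragma omp single
      {
        printf("level 1: %d\n", omp_get_level());   // 1
    #pragma omp parallel num_threads(2)
    #pragma omp single
        printf("level 2: %d\n", omp_get_level());   // 2, even when the inner
      }                                             // region is serialized
      return 0;
    }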
@@ -674,12 +823,14 @@ void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, emitGenericVarsProlog(CGF, WST.Loc); } -void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, +void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { - emitGenericVarsEpilog(CGF); + IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -707,8 +858,7 @@ void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { - ExecutionModeRAII ModeRAII(CurrentExecutionMode, - CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd); + ExecutionModeRAII ModeRAII(IsInSPMDExecutionMode, /*NewMode=*/true); EntryFunctionState EST; // Emit target region as a standalone region. @@ -754,10 +904,17 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader( CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); + + emitGenericVarsProlog(CGF, D.getLocStart()); } void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { + if (!CGF.HaveInsertPoint()) + return; + + emitGenericVarsEpilog(CGF); + if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); @@ -781,11 +938,12 @@ void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, // 'generic', the runtime reserves one warp for the master, otherwise, all // warps participate in parallel work. static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, - CGOpenMPRuntimeNVPTX::ExecutionMode Mode) { - auto *GVMode = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, - llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, Mode), Twine(Name, "_exec_mode")); + bool Mode) { + auto *GVMode = + new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, + llvm::GlobalValue::WeakAnyLinkage, + llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1), + Twine(Name, "_exec_mode")); CGM.addCompilerUsedGlobal(GVMode); } @@ -846,8 +1004,8 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); // On termination condition (workid == 0), exit loop. - llvm::Value *ShouldTerminate = - Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); + llvm::Value *WorkID = Bld.CreateLoad(WorkFn); + llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. @@ -886,6 +1044,22 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(CheckNextBB); } + // Default case: call to outlined function through pointer if the target + // region makes a declare target call that may contain an orphaned parallel + // directive. + auto *ParallelFnTy = + llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, + /*isVarArg=*/false) + ->getPointerTo(); + llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy); + // Insert call to work function via shared wrapper. The shared + // wrapper takes two arguments: + // - the parallelism level; + // - the thread ID; + emitCall(CGF, WST.Loc, WorkFnCast, + {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); + // Go to end of parallel region. + CGF.EmitBranch(TerminateBB); // Signal end of parallel region. 
CGF.EmitBlock(TerminateBB); @@ -1163,6 +1337,14 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); break; } + case OMPRTL_NVPTX__kmpc_parallel_level: { + // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level"); + break; + } } return RTLFn; } @@ -1198,27 +1380,19 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM); - switch (Mode) { - case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic: - emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd: + bool Mode = supportsSPMDExecutionMode(D); + if (Mode) emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); - break; - case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown: - llvm_unreachable( - "Unknown programming model for OpenMP directive on NVPTX target."); - } + else + emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM, "_", "$"), - CurrentExecutionMode(ExecutionMode::Unknown) { + : CGOpenMPRuntime(CGM, "_", "$") { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); } @@ -1258,23 +1432,32 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( // Emit target region as a standalone region. 
   class NVPTXPrePostActionTy : public PrePostActionTy {
     SourceLocation &Loc;
+    bool &IsInParallelRegion;
+    bool PrevIsInParallelRegion;
 
   public:
-    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
+    NVPTXPrePostActionTy(SourceLocation &Loc, bool &IsInParallelRegion)
+        : Loc(Loc), IsInParallelRegion(IsInParallelRegion) {}
     void Enter(CodeGenFunction &CGF) override {
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
           .emitGenericVarsProlog(CGF, Loc);
+      PrevIsInParallelRegion = IsInParallelRegion;
+      IsInParallelRegion = true;
     }
     void Exit(CodeGenFunction &CGF) override {
+      IsInParallelRegion = PrevIsInParallelRegion;
       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
           .emitGenericVarsEpilog(CGF);
     }
-  } Action(Loc);
+  } Action(Loc, IsInParallelRegion);
   CodeGen.setAction(Action);
+  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
+  IsInTargetMasterThreadRegion = false;
   auto *OutlinedFun =
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
-  if (!isInSpmdExecutionMode()) {
+  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+  if (!isInSpmdExecutionMode() && !IsInParallelRegion) {
     llvm::Function *WrapperFun =
         createParallelDataSharingWrapper(OutlinedFun, D);
     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
@@ -1316,6 +1499,9 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
 
 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                  SourceLocation Loc) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   CGBuilderTy &Bld = CGF.Builder;
 
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
@@ -1402,6 +1588,9 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
 }
 
 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I != FunctionGlobalizedDecls.end()) {
     I->getSecond().MappedParams->restore(CGF);
@@ -1449,31 +1638,61 @@ void CGOpenMPRuntimeNVPTX::emitParallelCall(
   if (isInSpmdExecutionMode())
     emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
   else
-    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
+    emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
 }
 
-void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
+void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
     CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
   llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
-  llvm::Function *WFn = WrapperFunctionsMap[Fn];
-
-  assert(WFn && "Wrapper function does not exist!");
 
   // Force inline this outlined function at its call site.
   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
 
-  auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
-                                                   PrePostActionTy &) {
-    CGBuilderTy &Bld = CGF.Builder;
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /*DestWidth=*/32, /*Signed=*/1),
+                                       ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  Address ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, ThreadIDAddr](
+                       CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+
+    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
+    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
+  };
+  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
+    RegionCodeGenTy RCG(CodeGen);
+    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *Args[] = {RTLoc, ThreadID};
+
+    NVPTXActionTy Action(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
+        Args,
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
+        Args);
+    RCG.setAction(Action);
+    RCG(CGF);
+  };
+  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
+                                                  PrePostActionTy &Action) {
+    CGBuilderTy &Bld = CGF.Builder;
+    llvm::Function *WFn = WrapperFunctionsMap[Fn];
+    assert(WFn && "Wrapper function does not exist!");
     llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
 
     // Prepare for parallel region. Indicate the outlined function.
     llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
-    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
-                            OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
-                        Args);
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
+        Args);
 
     // Create a private scope that will globalize the arguments
     // passed from the outside of the target region.
@@ -1496,13 +1715,13 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
 
     // Store variable address in a list of references to pass to workers.
     unsigned Idx = 0;
     ASTContext &Ctx = CGF.getContext();
-    Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
-        Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
-        .castAs<PointerType>());
+    Address SharedArgListAddress = CGF.EmitLoadOfPointer(
+        SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
+                        .castAs<PointerType>());
     for (llvm::Value *V : CapturedVars) {
-      Address Dst = Bld.CreateConstInBoundsGEP(
-          SharedArgListAddress, Idx, CGF.getPointerSize());
-      llvm::Value * PtrV;
+      Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                               CGF.getPointerSize());
+      llvm::Value *PtrV;
       if (V->getType()->isIntegerTy())
         PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
       else
@@ -1533,43 +1752,67 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
     Work.emplace_back(WFn);
   };
 
-  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-  llvm::Value *ThreadID = getThreadID(CGF, Loc);
-  llvm::Value *Args[] = {RTLoc, ThreadID};
-
-  auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
-                                                       PrePostActionTy &) {
-    auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
-                                                   PrePostActionTy &Action) {
-      Action.Enter(CGF);
-
-      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-      Address ZeroAddr =
-          CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
-                                /*DestWidth=*/32, /*Signed=*/1),
-                            ".zero.addr");
-      CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-      OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
-      OutlinedFnArgs.push_back(ZeroAddr.getPointer());
-      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
-      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
-    };
-
+  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen, &CodeGen](
+                             CodeGenFunction &CGF, PrePostActionTy &Action) {
     RegionCodeGenTy RCG(CodeGen);
-    NVPTXActionTy Action(
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
-        Args,
-        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
-        Args);
-    RCG.setAction(Action);
-    RCG(CGF);
+    if (IsInParallelRegion) {
+      SeqGen(CGF, Action);
+    } else if (IsInTargetMasterThreadRegion) {
+      L0ParallelGen(CGF, Action);
+    } else {
+      // Check for master and then parallelism:
+      // if (is_master) {
+      //   Worker call.
+      // } else if (__kmpc_parallel_level(loc, gtid)) {
+      //   Serialized execution.
+      // } else {
+      //   Outlined function call.
+      // }
      CGBuilderTy &Bld = CGF.Builder;
+      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+      if (!isInSpmdExecutionMode()) {
+        llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
+        llvm::BasicBlock *ParallelCheckBB =
+            CGF.createBasicBlock(".parallelcheck");
+        llvm::Value *IsMaster =
+            Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
+        Bld.CreateCondBr(IsMaster, MasterCheckBB, ParallelCheckBB);
+        CGF.EmitBlock(MasterCheckBB);
+        L0ParallelGen(CGF, Action);
+        CGF.EmitBranch(ExitBB);
+        // There is no need to emit line number for unconditional branch.
+        (void)ApplyDebugLocation::CreateEmpty(CGF);
+        CGF.EmitBlock(ParallelCheckBB);
+      }
+      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
+      llvm::Value *ThreadID = getThreadID(CGF, Loc);
+      llvm::Value *PL = CGF.EmitRuntimeCall(
+          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
+          {RTLoc, ThreadID});
+      llvm::Value *Res = Bld.CreateIsNotNull(PL);
+      llvm::BasicBlock *ThenBlock = CGF.createBasicBlock("omp_if.then");
+      llvm::BasicBlock *ElseBlock = CGF.createBasicBlock("omp_if.else");
+      Bld.CreateCondBr(Res, ThenBlock, ElseBlock);
+      // Emit the 'then' code.
+      CGF.EmitBlock(ThenBlock);
+      SeqGen(CGF, Action);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      // Emit the 'else' code.
+      CGF.EmitBlock(ElseBlock);
+      RCG(CGF);
+      // There is no need to emit line number for unconditional branch.
+      (void)ApplyDebugLocation::CreateEmpty(CGF);
+      // Emit the continuation block for code after the if.
+      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+    }
   };
 
   if (IfCond) {
-    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
+    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
   } else {
     CodeGenFunction::RunCleanupsScope Scope(CGF);
-    RegionCodeGenTy ThenRCG(L0ParallelGen);
+    RegionCodeGenTy ThenRCG(LNParallelGen);
     ThenRCG(CGF);
   }
 }
@@ -3090,6 +3333,9 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
 
 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                               const Decl *D) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return;
+
   assert(D && "Expected function or captured|block decl.");
   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
          "Function is registered already.");
@@ -3143,6 +3389,9 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
 
 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                         const VarDecl *VD) {
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
+    return Address::invalid();
+
   VD = VD->getCanonicalDecl();
   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 365d4f52aa..a5f39c28a7 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -25,7 +25,7 @@ namespace CodeGen {
 
 class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime {
 private:
-  // Parallel outlined function work for workers to execute.
+  /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
 
   struct EntryFunctionState {
@@ -52,14 +52,14 @@ private:
   /// \brief Helper for worker function. Emit body of worker loop.
   void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
 
-  /// \brief Helper for generic target entry function. Guide the master and
+  /// \brief Helper for non-SPMD target entry function. Guide the master and
   /// worker threads to their respective locations.
-  void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
                               WorkerFunctionState &WST);
 
-  /// \brief Signal termination of OMP execution for generic target entry
+  /// \brief Signal termination of OMP execution for non-SPMD target entry
   /// function.
-  void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
   /// Helper for generic variables globalization prolog.
   void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc);
@@ -93,7 +93,7 @@ private:
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
   /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
-  void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
                          llvm::Function *&OutlinedFn,
                          llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
                          const RegionCodeGenTy &CodeGen);
@@ -133,14 +133,14 @@ private:
   /// \brief Emits code for parallel or serial call of the \a OutlinedFn with
   /// variables captured in a record which address is stored in \a
   /// CapturedStruct.
-  /// This call is for the Generic Execution Mode.
+  /// This call is for the Non-SPMD Execution Mode.
   /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
   /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
   /// \param CapturedVars A pointer to the record with the references to
   /// variables used in \a OutlinedFn function.
   /// \param IfCond Condition in the associated 'if' clause, if it was
   /// specified, nullptr otherwise.
-  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+  void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
                                llvm::Value *OutlinedFn,
                                ArrayRef<llvm::Value *> CapturedVars,
                                const Expr *IfCond);
@@ -304,15 +304,15 @@ public:
   Address getAddressOfLocalVariable(CodeGenFunction &CGF,
                                     const VarDecl *VD) override;
 
-  /// Target codegen is specialized based on two programming models: the
-  /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
-  /// model for constructs like 'target parallel' that support it.
-  enum ExecutionMode {
-    /// Single Program Multiple Data.
-    Spmd,
-    /// Generic codegen to support fork-join model.
+  /// Target codegen is specialized based on two data-sharing modes: CUDA, in
+  /// which the local variables are actually global threadlocal, and Generic,
+  /// in which the local variables are placed in global memory if they may
+  /// escape their declaration context.
+  enum DataSharingMode {
+    /// CUDA data sharing mode.
+    CUDA,
+    /// Generic data-sharing mode.
     Generic,
-    Unknown,
   };
 
   /// Cleans up references to the objects in finished function.
@@ -320,11 +320,17 @@ public:
   void functionFinished(CodeGenFunction &CGF) override;
 
 private:
-  // Track the execution mode when codegening directives within a target
-  // region. The appropriate mode (generic/spmd) is set on entry to the
-  // target region and used by containing directives such as 'parallel'
-  // to emit optimized code.
-  ExecutionMode CurrentExecutionMode;
+  /// Track the execution mode when codegening directives within a target
+  /// region. The appropriate mode (SPMD/NON-SPMD) is set on entry to the
+  /// target region and used by containing directives such as 'parallel'
+  /// to emit optimized code.
+  bool IsInSPMDExecutionMode = false;
+
+  /// true if we're emitting the code for the target region and the next
+  /// parallel region is L0 for sure.
+  bool IsInTargetMasterThreadRegion = false;
+  /// true if we're definitely in the parallel region.
+  bool IsInParallelRegion = false;
 
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 6e3747a188..4bb3c7b0d3 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -2399,8 +2399,17 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
 
   // For the device mark the function as one that should be emitted.
   if (getLangOpts().OpenMPIsDevice && OpenMPRuntime &&
       !OpenMPRuntime->markAsGlobalTarget(GD) && FD->isDefined() &&
-      !DontDefer && !IsForDefinition)
-    addDeferredDeclToEmit(GD);
+      !DontDefer && !IsForDefinition) {
+    const FunctionDecl *FDDef = FD->getDefinition();
+    GlobalDecl GDDef;
+    if (const auto *CD = dyn_cast<CXXConstructorDecl>(FDDef))
+      GDDef = GlobalDecl(CD, GD.getCtorType());
+    else if (const auto *DD = dyn_cast<CXXDestructorDecl>(FDDef))
+      GDDef = GlobalDecl(DD, GD.getDtorType());
+    else
+      GDDef = GlobalDecl(FDDef);
+    addDeferredDeclToEmit(GDDef);
+  }
 
   if (FD->isMultiVersion() && FD->getAttr<TargetAttr>()->isDefaultVersion()) {
     UpdateMultiVersionNames(GD, FD);
diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp
index 70f3973acd..718c650bec 100644
--- a/test/OpenMP/nvptx_target_codegen.cpp
+++ b/test/OpenMP/nvptx_target_codegen.cpp
@@ -9,15 +9,17 @@
 #define HEADER
 
 // Check that the execution mode of all 6 target regions is set to Generic Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l100}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l177}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l287}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l324}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l342}}_exec_mode = weak constant i8 1
-// CHECK-DAG: {{@__omp_offloading_.+l307}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l102}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l179}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l289}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l326}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l344}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l309}}_exec_mode = weak constant i8 1
 
 __thread int id;
 
+int baz(int f);
+
 template<typename tx, typename ty>
 struct TT{
   tx X;
@@ -33,7 +35,7 @@ int foo(int n) {
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l100}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l102}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -64,7 +66,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l100]]()
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l102]]()
   // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
   // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -106,7 +108,7 @@ int foo(int n) {
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l177}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l179}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -137,7 +139,7 @@ int foo(int n) {
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l177]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l179]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]], i[[SZ:32|64]] [[ID:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] 
to i16* @@ -180,7 +182,7 @@ int foo(int n) { id = aa; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l287}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l289}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -211,7 +213,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l287]](i[[SZ]] + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l289]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* @@ -343,6 +345,7 @@ struct S1 { { this->a = (double)b + 1.5; c[1][1] = ++a; + baz(a); } return c[1][1] + (int)b; @@ -364,7 +367,13 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+324}}_worker() +int baz(int f) { +#pragma omp parallel + f = 2; + return f; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+326}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -395,7 +404,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l324]](i[[SZ]] + // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l326]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -450,9 +459,10 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l342}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l344}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: store i8* null, i8** [[OMP_WORK_FN]], // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] @@ -469,6 +479,8 @@ int bar(int n){ // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] + // CHECK: [[WORK_FN:%.+]] = bitcast i8* [[WORK]] to void (i16, i32)* + // CHECK: call void [[WORK_FN]](i16 0, i32 [[GTID]]) // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[TERM_PARALLEL]] @@ -481,7 +493,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l342]]( + // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l344]]( // Create local storage for each capture. 
// CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] @@ -528,6 +540,7 @@ int bar(int n){ // CHECK-64-DAG:load i32, i32* [[REF_B]] // CHECK-32-DAG:load i32, i32* [[LOCAL_B]] // CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}} + // CHECK: call i32 [[BAZ:@.*baz.*]](i32 % // CHECK: br label {{%?}}[[TERMINATE:.+]] // // CHECK: [[TERMINATE]] @@ -538,9 +551,48 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void + // CHECK: define i32 [[BAZ]](i32 [[F:%.*]]) + // CHECK: [[ZERO_ADDR:%.+]] = alloca i32, + // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* + // CHECK: [[GTID_ADDR:%.+]] = alloca i32, + // CHECK: store i32 0, i32* [[ZERO_ADDR]] + // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) + // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* + // CHECK: [[F_PTR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[REC_ADDR]], i32 0, i32 0 + // CHECK: store i32 [[F]], i32* [[F_PTR]], + // CHECK: store i32 [[GTID]], i32* [[GTID_ADDR]], + // CHECK: icmp eq i32 + // CHECK: br i1 + + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*), i16 1) + // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 1) + // CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]], + // CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0 + // CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8* + // CHECK: store i8* [[F_REF]], i8** [[REF]], + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: call void @llvm.nvvm.barrier0() + // CHECK: call void @__kmpc_end_sharing_variables() + // CHECK: br label + + // CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: icmp ne i16 [[RES]], 0 + // CHECK: br i1 + + // CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]]) + // CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @{{.+}}, i32 [[GTID]]) + // CHECK: br label + + // CHECK: call void @__omp_outlined__(i32* [[GTID_ADDR]], i32* [[ZERO_ADDR]], i32* [[F_PTR]]) + // CHECK: br label + + // CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], + // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) + // CHECK: ret i32 [[RES]] - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l307}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l309}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -571,7 +623,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l307]](i[[SZ]] + // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l309]](i[[SZ]] // Create local storage for each capture. 
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp index ac1aae840c..7594d6f2d2 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -19,45 +19,20 @@ int main(int argc, char **argv) { return 0; } -// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker() -// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ -// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]]) +// CHECK: @__omp_offloading_{{.*}}_main_l16_exec_mode = weak constant i8 0 -// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) +// CHECK: define void @__omp_offloading_{{.*}}_main_l16(i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}}) // CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @ -// CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() +// CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_for_static_init_4( -// CHECK: call void @__kmpc_kernel_prepare_parallel( -// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4) -// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]], -// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8* -// CHECK: store i8* [[LB]], i8** [[BUF_PTR]], -// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1 -// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8* -// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]], -// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2 -// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8* -// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]], -// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3 -// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8* -// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]], -// CHECK: call void @llvm.nvvm.barrier0() -// CHECK: call void @llvm.nvvm.barrier0() -// CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: br label -// CHECK: call void @__kmpc_serialized_parallel(%struct.ident_t* @ -// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** % -// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]]) -// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* @ +// CHECK: call void [[PARALLEL:@.+]](i32* %{{.*}}, i32* %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.*}}, i{{64|32}} %{{.*}}, i32* %{{.*}}) // CHECK: br label % // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_kernel_deinit(i16 1) -// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @__kmpc_spmd_kernel_deinit() // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}}) // CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0) @@ -75,24 +50,4 @@ int main(int argc, char **argv) { // CHECK: call void 
@__kmpc_data_sharing_pop_stack(i8* [[RES]]) -// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32) -// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]]) -// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]], -// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0 -// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}* -// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]], -// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1 -// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}* -// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]], -// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2 -// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32* -// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]], -// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64 -// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3 -// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32** -// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]], -// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]]) -// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]]) -// CHECK: ret void - #endif -- 2.40.0
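
As a closing usage sketch (a hypothetical offloading program; only baz
mirrors the updated test above), a parallel region reached from inside
another parallel region on the device is now detected via
__kmpc_parallel_level and executed serially:

    #include <cstdio>

    int baz(int f) {
    #pragma omp parallel   // L2 when reached from the region in main();
      f = 2;               // with this patch it executes sequentially
      return f;
    }

    int main() {
      int r = 0;
    #pragma omp target map(from : r)
    #pragma omp parallel   // L1: real parallelism on the device
    #pragma omp single
      r = baz(3);
      printf("r = %d\n", r); // prints r = 2
      return 0;
    }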