From d9c61e0b6257db414f03ae1917ff3289c215e213 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 29 Aug 2018 18:32:21 +0000
Subject: [PATCH] [OPENMP][NVPTX] Add support for lightweight runtime.

If the target construct can be executed in SPMD mode and it is a loop-based
directive with static scheduling, we can use the lightweight runtime support.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@340953 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp          | 369 +++++++++++++++---
 lib/CodeGen/CGOpenMPRuntimeNVPTX.h            |   1 +
 .../declare_target_codegen_globalization.cpp  |  16 +-
 test/OpenMP/nvptx_SPMD_codegen.cpp            | 328 ++++++++++++++++
 test/OpenMP/nvptx_target_codegen.cpp          |  18 +-
 test/OpenMP/nvptx_target_parallel_codegen.cpp |   4 +-
 ...vptx_target_parallel_proc_bind_codegen.cpp |   6 +-
 ...vptx_target_parallel_reduction_codegen.cpp |   6 +-
 test/OpenMP/nvptx_target_teams_codegen.cpp    |   6 +-
 ..._teams_distribute_parallel_for_codegen.cpp |  36 +-
 ...s_distribute_parallel_for_simd_codegen.cpp |  32 +-
 11 files changed, 720 insertions(+), 102 deletions(-)
 create mode 100644 test/OpenMP/nvptx_SPMD_codegen.cpp

diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 0fd261150f..14845b5862 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -672,11 +672,19 @@ static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
   return false;
 }
 
+/// Checks if the directive is a distribute directive with lastprivate
+/// clauses. This construct does not support SPMD execution mode.
+static bool hasDistributeWithLastprivateClauses(const OMPExecutableDirective &D) {
+  return isOpenMPDistributeDirective(D.getDirectiveKind()) &&
+         D.hasClausesOfKind<OMPLastprivateClause>();
+}
+
 /// Check for inner (nested) SPMD construct, if any
 static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                    const OMPExecutableDirective &D) {
   const auto *CS = D.getInnermostCapturedStmt();
-  const auto *Body = CS->getCapturedStmt()->IgnoreContainers();
+  const auto *Body =
+      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
   const Stmt *ChildStmt = getSingleCompoundChild(Body);
 
   if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
@@ -684,29 +692,221 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
     switch (D.getDirectiveKind()) {
     case OMPD_target:
       if (isOpenMPParallelDirective(DKind) &&
-          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
+          !hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
+          !hasDistributeWithLastprivateClauses(*NestedDir))
         return true;
-      if (DKind == OMPD_teams || DKind == OMPD_teams_distribute) {
-        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+      if (DKind == OMPD_teams) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
+            /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
         ChildStmt = getSingleCompoundChild(Body);
         if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
           if (isOpenMPParallelDirective(DKind) &&
-              !hasParallelIfNumThreadsClause(Ctx, *NND))
+              !hasParallelIfNumThreadsClause(Ctx, *NND) &&
+              !hasDistributeWithLastprivateClauses(*NND))
             return true;
-          if (DKind == OMPD_distribute) {
-            Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+        }
+      }
+      return false;
+    case OMPD_target_teams:
+      return isOpenMPParallelDirective(DKind) &&
+             !hasParallelIfNumThreadsClause(Ctx, *NestedDir) &&
+             !hasDistributeWithLastprivateClauses(*NestedDir);
+    case OMPD_target_simd:
+    case OMPD_target_parallel:
+    case OMPD_target_parallel_for:
+    case OMPD_target_parallel_for_simd:
+    case OMPD_target_teams_distribute:
+    case OMPD_target_teams_distribute_simd:
+    case OMPD_target_teams_distribute_parallel_for:
+    case OMPD_target_teams_distribute_parallel_for_simd:
+    case OMPD_parallel:
+    case OMPD_for:
+    case OMPD_parallel_for:
+    case OMPD_parallel_sections:
+    case OMPD_for_simd:
+    case OMPD_parallel_for_simd:
+    case OMPD_cancel:
+    case OMPD_cancellation_point:
+    case OMPD_ordered:
+    case OMPD_threadprivate:
+    case OMPD_task:
+    case OMPD_simd:
+    case OMPD_sections:
+    case OMPD_section:
+    case OMPD_single:
+    case OMPD_master:
+    case OMPD_critical:
+    case OMPD_taskyield:
+    case OMPD_barrier:
+    case OMPD_taskwait:
+    case OMPD_taskgroup:
+    case OMPD_atomic:
+    case OMPD_flush:
+    case OMPD_teams:
+    case OMPD_target_data:
+    case OMPD_target_exit_data:
+    case OMPD_target_enter_data:
+    case OMPD_distribute:
+    case OMPD_distribute_simd:
+    case OMPD_distribute_parallel_for:
+    case OMPD_distribute_parallel_for_simd:
+    case OMPD_teams_distribute:
+    case OMPD_teams_distribute_simd:
+    case OMPD_teams_distribute_parallel_for:
+    case OMPD_teams_distribute_parallel_for_simd:
+    case OMPD_target_update:
+    case OMPD_declare_simd:
+    case OMPD_declare_target:
+    case OMPD_end_declare_target:
+    case OMPD_declare_reduction:
+    case OMPD_taskloop:
+    case OMPD_taskloop_simd:
+    case OMPD_unknown:
+      llvm_unreachable("Unexpected directive.");
+    }
+  }
+
+  return false;
+}
+
+static bool supportsSPMDExecutionMode(ASTContext &Ctx,
+                                      const OMPExecutableDirective &D) {
+  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
+  switch (DirectiveKind) {
+  case OMPD_target:
+  case OMPD_target_teams:
+    return hasNestedSPMDDirective(Ctx, D);
+  case OMPD_target_parallel:
+  case OMPD_target_parallel_for:
+  case OMPD_target_parallel_for_simd:
+    return !hasParallelIfNumThreadsClause(Ctx, D);
+  case OMPD_target_teams_distribute_parallel_for:
+  case OMPD_target_teams_distribute_parallel_for_simd:
+    // Distribute with lastprivates requires non-SPMD execution mode.
+    return !hasParallelIfNumThreadsClause(Ctx, D) &&
+           !hasDistributeWithLastprivateClauses(D);
+  case OMPD_target_simd:
+  case OMPD_target_teams_distribute:
+  case OMPD_target_teams_distribute_simd:
+    return false;
+  case OMPD_parallel:
+  case OMPD_for:
+  case OMPD_parallel_for:
+  case OMPD_parallel_sections:
+  case OMPD_for_simd:
+  case OMPD_parallel_for_simd:
+  case OMPD_cancel:
+  case OMPD_cancellation_point:
+  case OMPD_ordered:
+  case OMPD_threadprivate:
+  case OMPD_task:
+  case OMPD_simd:
+  case OMPD_sections:
+  case OMPD_section:
+  case OMPD_single:
+  case OMPD_master:
+  case OMPD_critical:
+  case OMPD_taskyield:
+  case OMPD_barrier:
+  case OMPD_taskwait:
+  case OMPD_taskgroup:
+  case OMPD_atomic:
+  case OMPD_flush:
+  case OMPD_teams:
+  case OMPD_target_data:
+  case OMPD_target_exit_data:
+  case OMPD_target_enter_data:
+  case OMPD_distribute:
+  case OMPD_distribute_simd:
+  case OMPD_distribute_parallel_for:
+  case OMPD_distribute_parallel_for_simd:
+  case OMPD_teams_distribute:
+  case OMPD_teams_distribute_simd:
+  case OMPD_teams_distribute_parallel_for:
+  case OMPD_teams_distribute_parallel_for_simd:
+  case OMPD_target_update:
+  case OMPD_declare_simd:
+  case OMPD_declare_target:
+  case OMPD_end_declare_target:
+  case OMPD_declare_reduction:
+  case OMPD_taskloop:
+  case OMPD_taskloop_simd:
+  case OMPD_unknown:
+    break;
+  }
+  llvm_unreachable(
+      "Unknown programming model for OpenMP directive on NVPTX target.");
+}
+
+/// Check if the directive is a loop-based directive that either has no
+/// schedule clause or has static scheduling.
+static bool hasStaticScheduling(const OMPExecutableDirective &D) {
+  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
+         isOpenMPLoopDirective(D.getDirectiveKind()) &&
+         "Expected loop-based directive.");
+  return !D.hasClausesOfKind<OMPOrderedClause>() &&
+         (!D.hasClausesOfKind<OMPScheduleClause>() ||
+          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
+                       [](const OMPScheduleClause *C) {
+                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
+                       }));
+}
+
+/// Check for inner (nested) lightweight runtime construct, if any
+static bool hasNestedLightweightDirective(ASTContext &Ctx,
+                                          const OMPExecutableDirective &D) {
+  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
+  const auto *CS = D.getInnermostCapturedStmt();
+  const auto *Body =
+      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
+  const Stmt *ChildStmt = getSingleCompoundChild(Body);
+
+  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
+    switch (D.getDirectiveKind()) {
+    case OMPD_target:
+      if (isOpenMPParallelDirective(DKind) &&
+          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
+          hasStaticScheduling(*NestedDir))
+        return true;
+      if (DKind == OMPD_parallel) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
+            /*IgnoreCaptured=*/true);
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          if (isOpenMPWorksharingDirective(DKind) &&
+              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
+            return true;
+        }
+      } else if (DKind == OMPD_teams) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
+            /*IgnoreCaptured=*/true);
+        if (!Body)
+          return false;
+        ChildStmt = getSingleCompoundChild(Body);
+        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
+          DKind = NND->getDirectiveKind();
+          if (isOpenMPParallelDirective(DKind) &&
+              isOpenMPWorksharingDirective(DKind) &&
+              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
+            return true;
+          if (DKind == OMPD_parallel) {
+            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
+                /*IgnoreCaptured=*/true);
             if (!Body)
               return false;
             ChildStmt = getSingleCompoundChild(Body);
-            if (!ChildStmt)
-              return false;
             if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
               DKind = NND->getDirectiveKind();
-              return isOpenMPParallelDirective(DKind) &&
-                     !hasParallelIfNumThreadsClause(Ctx, *NND);
+              if (isOpenMPWorksharingDirective(DKind) &&
+                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
+                return true;
             }
           }
         }
@@ -714,25 +914,28 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
       return false;
     case OMPD_target_teams:
       if (isOpenMPParallelDirective(DKind) &&
-          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
+          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
+          hasStaticScheduling(*NestedDir))
         return true;
-      if (DKind == OMPD_distribute) {
-        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers();
+      if (DKind == OMPD_parallel) {
+        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
+            /*IgnoreCaptured=*/true);
         if (!Body)
           return false;
        ChildStmt = getSingleCompoundChild(Body);
         if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
           DKind = NND->getDirectiveKind();
-          return isOpenMPParallelDirective(DKind) &&
-                 !hasParallelIfNumThreadsClause(Ctx, *NND);
+          if (isOpenMPWorksharingDirective(DKind) &&
+              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
+            return true;
         }
       }
       return false;
+    case OMPD_target_parallel:
+      return isOpenMPWorksharingDirective(DKind) &&
+             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
     case OMPD_target_teams_distribute:
-      return isOpenMPParallelDirective(DKind) &&
-             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
     case OMPD_target_simd:
-    case OMPD_target_parallel:
     case OMPD_target_parallel_for:
     case OMPD_target_parallel_for_simd:
     case OMPD_target_teams_distribute_simd:
@@ -788,21 +991,26 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
   return false;
 }
 
-static bool supportsSPMDExecutionMode(ASTContext &Ctx,
-                                      const OMPExecutableDirective &D) {
+/// Checks if the construct supports the lightweight runtime. It must be an
+/// SPMD construct with an inner loop-based construct that has static
+/// scheduling.
+static bool supportsLightweightRuntime(ASTContext &Ctx,
+                                       const OMPExecutableDirective &D) {
+  if (!supportsSPMDExecutionMode(Ctx, D))
+    return false;
   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
   switch (DirectiveKind) {
   case OMPD_target:
   case OMPD_target_teams:
-  case OMPD_target_teams_distribute:
-    return hasNestedSPMDDirective(Ctx, D);
   case OMPD_target_parallel:
+    return hasNestedLightweightDirective(Ctx, D);
   case OMPD_target_parallel_for:
   case OMPD_target_parallel_for_simd:
   case OMPD_target_teams_distribute_parallel_for:
   case OMPD_target_teams_distribute_parallel_for_simd:
-    return !hasParallelIfNumThreadsClause(Ctx, D);
+    // (Last|First)-privates must be shared in parallel region.
+    return hasStaticScheduling(D);
   case OMPD_target_simd:
+  case OMPD_target_teams_distribute:
   case OMPD_target_teams_distribute_simd:
     return false;
   case OMPD_parallel:
@@ -1010,18 +1218,20 @@ void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
   EST.ExitBB = CGF.createBasicBlock(".exit");
 
   // Initialize the OMP state in the runtime; called by all active threads.
-  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
-  // based on code analysis of the target region.
-  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
-                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
-                         /*RequiresDataSharing=*/Bld.getInt16(1)};
+  bool RequiresFullRuntime = !supportsLightweightRuntime(CGF.getContext(), D);
+  llvm::Value *Args[] = {
+      getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
+      /*RequiresOMPRuntime=*/
+      Bld.getInt16(RequiresFullRuntime ? 1 : 0),
+      /*RequiresDataSharing=*/Bld.getInt16(RequiresFullRuntime ? 1 : 0)};
   CGF.EmitRuntimeCall(
       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
-  // For data sharing, we need to initialize the stack.
-  CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(
-          OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
+  if (RequiresFullRuntime) {
+    // For data sharing, we need to initialize the stack.
+    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
+        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
+  }
 
   CGF.EmitBranch(ExecuteBB);
 
@@ -1414,7 +1624,8 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
     /// Build void __kmpc_data_sharing_init_stack_spmd();
     auto *FnTy =
         llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
-    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
+    RTLFn =
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
     break;
   }
   case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
@@ -1607,7 +1818,8 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
             .emitGenericVarsEpilog(CGF);
       }
     }
   } Action(Loc);
-  CodeGen.setAction(Action);
+  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
+    CodeGen.setAction(Action);
   llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
       D, ThreadIDVar, InnermostKind, CodeGen);
   llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
@@ -1640,19 +1852,61 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
   unsigned GlobalRecordSize =
       CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
   GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-  // TODO: allow the usage of shared memory to be controlled by
-  // the user, for now, default to global.
-  llvm::Value *GlobalRecordSizeArg[] = {
-      llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
-      CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-  llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
-      GlobalRecordSizeArg);
-  llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
+
+  llvm::Value *GlobalRecCastAddr;
+  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
+    llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
+    llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
+    llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
+    llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
+    Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
+    // There is no need to emit line number for unconditional branch.
+    (void)ApplyDebugLocation::CreateEmpty(CGF);
+    CGF.EmitBlock(SPMDBB);
+    Address RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack");
+    CGF.EmitBranch(ExitBB);
+    // There is no need to emit line number for unconditional branch.
+    (void)ApplyDebugLocation::CreateEmpty(CGF);
+    CGF.EmitBlock(NonSPMDBB);
+    // TODO: allow the usage of shared memory to be controlled by
+    // the user, for now, default to global.
+ llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), + CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + llvm::Value *GlobalRecValue = + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_data_sharing_push_stack), + GlobalRecordSizeArg); + GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + CGF.EmitBlock(ExitBB); + auto *Phi = Bld.CreatePHI(GlobalRecCastAddr->getType(), + /*NumReservedValues=*/2, "_select_stack"); + Phi->addIncoming(RecPtr.getPointer(), SPMDBB); + Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); + GlobalRecCastAddr = Phi; + I->getSecond().GlobalRecordAddr = Phi; + I->getSecond().IsInSPMDModeFlag = IsSPMD; + } else { + assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD && + "Expected Non-SPMD construct."); + // TODO: allow the usage of shared memory to be controlled by + // the user, for now, default to global. + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), + CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; + llvm::Value *GlobalRecValue = + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_data_sharing_push_stack), + GlobalRecordSizeArg); + GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo()); + I->getSecond().GlobalRecordAddr = GlobalRecValue; + I->getSecond().IsInSPMDModeFlag = nullptr; + } LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; // Emit the "global alloca" which is a GEP from the global declaration // record using the pointer returned by the runtime. @@ -1724,9 +1978,26 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) { Addr); } if (I->getSecond().GlobalRecordAddr) { - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); + if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) { + CGBuilderTy &Bld = CGF.Builder; + llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); + llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); + Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); + // There is no need to emit line number for unconditional branch. 
+        (void)ApplyDebugLocation::CreateEmpty(CGF);
+        CGF.EmitBlock(NonSPMDBB);
+        CGF.EmitRuntimeCall(
+            createNVPTXRuntimeFunction(
+                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
+            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
+        CGF.EmitBlock(ExitBB);
+      } else {
+        assert(getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_NonSPMD &&
+               "Expected Non-SPMD mode.");
+        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
+                                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
+                            I->getSecond().GlobalRecordAddr);
+      }
     }
   }
 }
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index f83e99f8a3..0975eb0641 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -374,6 +374,7 @@ private:
     llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
     const RecordDecl *GlobalRecord = nullptr;
     llvm::Value *GlobalRecordAddr = nullptr;
+    llvm::Value *IsInSPMDModeFlag = nullptr;
     std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
   };
   /// Maps the function to the list of the globalized variables with their
diff --git a/test/OpenMP/declare_target_codegen_globalization.cpp b/test/OpenMP/declare_target_codegen_globalization.cpp
index 7ef4f8af3d..6b08993c19 100644
--- a/test/OpenMP/declare_target_codegen_globalization.cpp
+++ b/test/OpenMP/declare_target_codegen_globalization.cpp
@@ -35,9 +35,19 @@ int maini1() {
 // CHECK-NOT: @__kmpc_data_sharing_push_stack
 
 // CHECK: define {{.*}}[[BAR]]()
+// CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]],
+// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode()
+// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0
+// CHECK: br i1 [[IS_SPMD]], label
+// CHECK: br label
 // CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0)
-// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%struct[.].*]]*
-// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST]]*
+// CHECK: br label
+// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ [[STACK]], {{.+}} ], [ [[GLOBALS]], {{.+}} ]
+// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: call {{.*}}[[FOO]](i32* dereferenceable{{.*}} [[A_ADDR]])
-// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+// CHECK: br i1 [[IS_SPMD]], label
+// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8*
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]])
+// CHECK: br label
 // CHECK: ret i32
diff --git a/test/OpenMP/nvptx_SPMD_codegen.cpp b/test/OpenMP/nvptx_SPMD_codegen.cpp
new file mode 100644
index 0000000000..615dc30bdc
--- /dev/null
+++ b/test/OpenMP/nvptx_SPMD_codegen.cpp
@@ -0,0 +1,328 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1 +// CHECK: @__omp_offloading_{{.+}}_l52_exec_mode = weak constant i8 1 +// CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1 + +void foo() { +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for simd schedule(guided) + for (int i = 0; i < 10; ++i) + ; +int a; +// CHECK: call void @__kmpc_kernel_init( +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target teams distribute parallel for lastprivate(a) + for (int i = 0; i < 10; ++i) + a = i; +#pragma omp target teams distribute parallel for schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel 
for schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams distribute parallel for schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target teams +#pragma omp distribute parallel for simd + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for simd schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target teams +#pragma omp distribute parallel for + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target teams +#pragma omp distribute parallel for schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(static) + 
for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp teams +#pragma omp distribute parallel for schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target parallel for + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel for schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target parallel +#pragma omp for simd + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target parallel +#pragma omp for simd schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp 
target +#pragma omp parallel +#pragma omp for simd ordered + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel +#pragma omp for simd schedule(guided) + for (int i = 0; i < 10; ++i) + ; +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 0, i16 0) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 {{.+}}) +#pragma omp target +#pragma omp parallel for + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(static) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(auto) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(runtime) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < 10; ++i) + ; +#pragma omp target +#pragma omp parallel for schedule(guided) + for (int i = 0; i < 10; ++i) + ; +} + +#endif + diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp index 5f9b3bd328..e404978290 100644 --- a/test/OpenMP/nvptx_target_codegen.cpp +++ b/test/OpenMP/nvptx_target_codegen.cpp @@ -554,13 +554,20 @@ int baz(int f, double &a) { // CHECK: ret void // CHECK: define i32 [[BAZ]](i32 [[F:%.*]], double* dereferenceable{{.*}}) + // CHECK: [[STACK:%.+]] = alloca [[GLOBAL_ST:%.+]], // CHECK: [[ZERO_ADDR:%.+]] = alloca i32, // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* // CHECK: [[GTID_ADDR:%.+]] = alloca i32, // CHECK: store i32 0, i32* [[ZERO_ADDR]] + // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() + // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 + // CHECK: br i1 [[IS_SPMD]], label + // CHECK: br label // CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0) - // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* - // CHECK: [[F_PTR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[REC_ADDR]], i32 0, i32 0 + // CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST]]* + // CHECK: br label + // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ [[STACK]], {{.+}} ], [ [[REC_ADDR]], {{.+}} ] + // CHECK: [[F_PTR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 // CHECK: store i32 %{{.+}}, i32* [[F_PTR]], // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() @@ -595,7 +602,12 @@ int baz(int f, double &a) { // CHECK: 
br label // CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], - // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) + // CHECK: store i32 [[RES]], i32* [[RET:%.+]], + // CHECK: br i1 [[IS_SPMD]], label + // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* + // CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) + // CHECK: br label + // CHECK: [[RES:%.+]] = load i32, i32* [[RET]], // CHECK: ret i32 [[RES]] diff --git a/test/OpenMP/nvptx_target_parallel_codegen.cpp b/test/OpenMP/nvptx_target_parallel_codegen.cpp index 15611f6475..6fccfbed56 100644 --- a/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -59,7 +59,7 @@ int bar(int n){ // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // @@ -102,7 +102,7 @@ int bar(int n){ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], + // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // diff --git a/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp index 1cdcb70417..163679d92b 100644 --- a/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp +++ b/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -47,7 +47,7 @@ int bar(int n){ } // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l22}}( - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // @@ -69,7 +69,7 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}( - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // @@ -90,7 +90,7 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}( - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // diff --git a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp index 02676272b0..a3790f2851 100644 --- a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -54,7 +54,7 @@ int bar(int n){ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( // - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // @@ -242,7 +242,7 @@ int 
bar(int n){ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( // - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // @@ -520,7 +520,7 @@ int bar(int n){ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // - // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // diff --git a/test/OpenMP/nvptx_target_teams_codegen.cpp b/test/OpenMP/nvptx_target_teams_codegen.cpp index c7667d83b9..5d3088d019 100644 --- a/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -227,13 +227,13 @@ int bar(int n){ // CHECK: ret void // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37( -// CHECK: call void @__kmpc_spmd_kernel_init( +// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1, i16 1) // CHECK: call void @__kmpc_data_sharing_init_stack_spmd -// CHECK: call i8* @__kmpc_data_sharing_push_stack( +// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( // CHECK-NOT: call void @__kmpc_serialized_parallel( // CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i16* %{{.*}}) // CHECK-NOT: call void @__kmpc_end_serialized_parallel( -// CHECK: call void @__kmpc_data_sharing_pop_stack( +// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( // CHECK: call void @__kmpc_spmd_kernel_deinit() // CHECK: ret diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index f74919fc82..bbcb19d77d 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -8,12 +8,13 @@ #ifndef HEADER #define HEADER -// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 +// Check that the execution mode of the target region with lastprivates on the gpu is set to Non-SPMD Mode. +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1 +// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode. 
+// CHECK-DAG: {{@__omp_offloading_.+l39}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0
 
 #define N 1000
 #define M 10
@@ -67,14 +68,14 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK-LABEL: define internal void @__omp_offloading_{{.+}}_l33_worker()
+
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l33(
+// CHECK: call void @__kmpc_kernel_init(i32 %{{.+}}, i16 1)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[OUTL1:@__omp_outlined.*]]_wrapper to i8*), i16 1)
 // CHECK: call void @__kmpc_for_static_fini(
-// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL1]](
@@ -84,8 +85,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
 // CHECK: {{call|invoke}} void [[OUTL2:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
@@ -99,8 +99,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
 // CHECK: {{call|invoke}} void [[OUTL3:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
@@ -115,8 +114,7 @@ int bar(int n){
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
 // CHECK: {{call|invoke}} void [[OUTL4:@.+]](
@@ -129,7 +127,7 @@
 // CHECK: call void @__kmpc_for_static_fini(
 // CHECK: ret void
 
-// CHECK: define weak void @__omp_offloading_{{.*}}_l56(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
+// CHECK: define weak void @__omp_offloading_{{.*}}_l57(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{[^)]+}})
 // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}})
 
 // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* dereferenceable{{.*}}, i32* %{{.*}})
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
index 0b945a69b0..86768b04ae 100644
--- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
+++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp
@@ -8,11 +8,12 @@
 #ifndef HEADER
 #define HEADER
 
-// Check that the execution mode of all 2 target regions on the gpu is set to SPMD Mode.
-// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l36}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 0
-// CHECK-DAG: {{@__omp_offloading_.+l46}}_exec_mode = weak constant i8 0
+// Check that the execution mode of the target region with lastprivates on the gpu is set to Non-SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 1
+// Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0
 
 #define N 1000
 #define M 10
@@ -62,14 +63,14 @@ int bar(int n){
   return a;
 }
 
-// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
-// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK-LABEL: define internal void @__omp_offloading_{{.+}}_l31_worker()
+
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l31(
+// CHECK: call void @__kmpc_kernel_init(i32 %{{.+}}, i16 1)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
-// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[OUTL1:@__omp_outlined.*]]_wrapper to i8*), i16 1)
 // CHECK: call void @__kmpc_for_static_fini(
-// CHECK: call void @__kmpc_spmd_kernel_deinit()
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
 // CHECK: ret void
 
 // CHECK: define internal void [[OUTL1]](
@@ -79,8 +80,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
 // CHECK: {{call|invoke}} void [[OUTL2:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
@@ -94,8 +94,7 @@ int bar(int n){
 
 // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}(
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92,
 // CHECK: {{call|invoke}} void [[OUTL3:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(
@@ -110,8 +109,7 @@ int bar(int n){
 // CHECK: define {{.*}}void {{@__omp_offloading_.+}}({{.+}}, i{{32|64}} [[F_IN:%.+]])
 // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}},
 // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
-// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
+// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
 // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align
 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 92, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]],
 // CHECK: {{call|invoke}} void [[OUTL4:@.+]](
-- 
2.40.0
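A minimal user-level sketch of the distinction the patch draws; the function
names below are invented for illustration, and the expected kernel-init flags
follow the nvptx_SPMD_codegen.cpp checks above (i16 0, i16 0 for the
lightweight runtime, i16 1, i16 1 for the full runtime):

void uses_lightweight_runtime(int *a, int n) {
  // SPMD-capable construct, loop based, static scheduling: codegen passes
  // RequiresOMPRuntime=0 and RequiresDataSharing=0 to __kmpc_spmd_kernel_init.
#pragma omp target teams distribute parallel for schedule(static)
  for (int i = 0; i < n; ++i)
    a[i] += i;
}

void needs_full_runtime(int *a, int n) {
  // Still SPMD, but hasStaticScheduling() rejects schedule(dynamic), so the
  // kernel is initialized with both flags set to 1 (full runtime).
#pragma omp target teams distribute parallel for schedule(dynamic)
  for (int i = 0; i < n; ++i)
    a[i] += i;
}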