From d407903caa09a280004a43f523734a6408b9fff1 Mon Sep 17 00:00:00 2001 From: Arpith Chacko Jacob Date: Thu, 5 Jan 2017 15:24:05 +0000 Subject: [PATCH] [OpenMP] Update target codegen for NVPTX device. This patch includes updates for codegen of the target region for the NVPTX device. It moves initializers from the compiler to the runtime and updates the worker loop to assume parallel work is retrieved from the runtime. A subsequent patch will update the codegen to retrieve the parallel work using calls to the runtime. It includes the removal of the inline attribute for the worker loop and disabling debug info in it. This allows codegen for a target directive and serial execution on the NVPTX device. Reviewers: ABataev Differential Revision: https://reviews.llvm.org/D28125 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@291121 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 268 ++++++++++---------- lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 47 ++-- test/OpenMP/nvptx_target_codegen.cpp | 354 +++++++++++++++------------ 3 files changed, 346 insertions(+), 323 deletions(-) diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index fe0e2acdfd..942cae6727 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -22,14 +22,10 @@ using namespace CodeGen; namespace { enum OpenMPRTLFunctionNVPTX { - /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle, - /// kmp_int32 thread_limit); + /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); OMPRTL_NVPTX__kmpc_kernel_init, -}; - -// NVPTX Address space -enum AddressSpace { - AddressSpaceShared = 3, + /// \brief Call to void __kmpc_kernel_deinit(); + OMPRTL_NVPTX__kmpc_kernel_deinit, }; } // namespace @@ -70,6 +66,15 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) { /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } +/// Get the value of the thread_limit clause in the teams directive. +/// The runtime encodes thread_limit in the launch parameter, always starting +/// thread_limit+warpSize threads per team. +static llvm::Value *getThreadLimit(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), + "thread_limit"); +} + /// Get the thread id of the OMP master thread. /// The master thread id is the first thread (lane) of the last warp in the /// GPU block. Warp size is assumed to be some power of 2. @@ -103,35 +108,105 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, /* placeholder */ "_worker", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); - WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); - WorkerFn->addFnAttr(llvm::Attribute::NoInline); } -void CGOpenMPRuntimeNVPTX::initializeEnvironment() { - // - // Initialize master-worker control state in shared memory. - // +void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) { + EntryFunctionState EST; + WorkerFunctionState WST(CGM); + + // Emit target region as a standalone region. + class NVPTXPrePostActionTy : public PrePostActionTy { + CGOpenMPRuntimeNVPTX &RT; + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; + + public: + NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) + : RT(RT), EST(EST), WST(WST) {} + void Enter(CodeGenFunction &CGF) override { + RT.emitGenericEntryHeader(CGF, EST, WST); + } + void Exit(CodeGenFunction &CGF) override { + RT.emitGenericEntryFooter(CGF, EST); + } + } Action(*this, EST, WST); + CodeGen.setAction(Action); + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + IsOffloadEntry, CodeGen); - auto DL = CGM.getDataLayout(); - ActiveWorkers = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); - - WorkID = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); + // Create the worker function + emitWorkerFunction(WST); + + // Now change the name of the worker function to correspond to this target + // region's entry function. + WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); +} + +// Setup NVPTX threads for master-worker OpenMP scheme. +void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, + EntryFunctionState &EST, + WorkerFunctionState &WST) { + CGBuilderTy &Bld = CGF.Builder; + + llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); + llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); + llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); + EST.ExitBB = CGF.createBasicBlock(".exit"); + + auto *IsWorker = + Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); + Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); + + CGF.EmitBlock(WorkerBB); + CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(MasterCheckBB); + auto *IsMaster = + Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); + + CGF.EmitBlock(MasterBB); + // First action in sequential region: + // Initialize the state of the OpenMP runtime library on the GPU. + llvm::Value *Args[] = {getThreadLimit(CGF)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); +} + +void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST) { + if (!EST.ExitBB) + EST.ExitBB = CGF.createBasicBlock(".exit"); + + llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); + CGF.EmitBranch(TerminateBB); + + CGF.EmitBlock(TerminateBB); + // Signal termination condition. + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); + // Barrier to terminate worker threads. + syncCTAThreads(CGF); + // Master thread jumps to exit point. + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(EST.ExitBB); + EST.ExitBB = nullptr; } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { auto &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); emitWorkerLoop(CGF, WST); CGF.FinishFunction(); @@ -163,21 +238,26 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(AwaitBB); // Wait for parallel work syncCTAThreads(CGF); + + Address WorkFn = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); + Address ExecStatus = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); + CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); + CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); + + // TODO: Call into runtime to get parallel work. + // On termination condition (workid == 0), exit loop. - llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( - Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), - "should_terminate"); + llvm::Value *ShouldTerminate = + Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. CGF.EmitBlock(SelectWorkersBB); - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - llvm::Value *ActiveThread = Bld.CreateICmpSLT( - ThreadID, - Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), - "active_thread"); - Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); + llvm::Value *IsActive = + Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); + Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); @@ -197,72 +277,6 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(ExitBB); } -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - // Get the master thread id. - llvm::Value *MasterID = getMasterThreadID(CGF); - // Current thread's identifier. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - - // Setup BBs in entry function. - llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - // The head (master thread) marches on while its body of companion threads in - // the warp go to sleep. - llvm::Value *ShouldDie = - Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); - Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); - - // Select worker threads... - CGF.EmitBlock(WorkerCheckBB); - llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); - - // ... and send to worker loop, awaiting parallel invocation. - CGF.EmitBlock(WorkerBB); - CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); - CGF.EmitBranch(EST.ExitBB); - - // Only master thread executes subsequent serial code. - CGF.EmitBlock(MasterBB); - - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), - Args); -} - -void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - Bld.CreateAlignedStore( - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, - WorkID->getAlignment()); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -272,14 +286,20 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast(Function)) { case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 omp_handle, - // kmp_int32 thread_limit); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; + // Build void __kmpc_kernel_init(kmp_int32 thread_limit); + llvm::Type *TypeParams[] = {CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); break; } + case OMPRTL_NVPTX__kmpc_kernel_deinit: { + // Build void __kmpc_kernel_deinit(); + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); + break; + } } return RTLFn; } @@ -315,44 +335,14 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - EntryFunctionState EST; - WorkerFunctionState WST(CGM); - - // Emit target region as a standalone region. - class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX &RT; - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : RT(RT), EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - RT.emitEntryHeader(CGF, EST, WST); - } - void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); } - } Action(*this, EST, WST); - CodeGen.setAction(Action); - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - - // Create the worker function - emitWorkerFunction(WST); - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); + emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { + : CGOpenMPRuntime(CGM) { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); - - // Called once per module during initialization. - initializeEnvironment(); } void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index a33fb27579..723f7f8882 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -24,7 +24,7 @@ namespace clang { namespace CodeGen { class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime { -public: +private: struct EntryFunctionState { llvm::BasicBlock *ExitBB = nullptr; }; @@ -40,34 +40,21 @@ public: void createWorkerFunction(CodeGenModule &CGM); }; - /// \brief Helper for target entry function. Guide the master and worker - /// threads to their respective locations. - void emitEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, - WorkerFunctionState &WST); - - /// \brief Signal termination of OMP execution. - void emitEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); - -private: - // - // Private state and methods. - // - - // Master-worker control state. - // Number of requested OMP threads in parallel region. - llvm::GlobalVariable *ActiveWorkers; - // Outlined function for the workers to execute. - llvm::GlobalVariable *WorkID; - - /// \brief Initialize master-worker control state. - void initializeEnvironment(); - /// \brief Emit the worker function for the current target region. void emitWorkerFunction(WorkerFunctionState &WST); /// \brief Helper for worker function. Emit body of worker loop. void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST); + /// \brief Helper for generic target entry function. Guide the master and + /// worker threads to their respective locations. + void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, + WorkerFunctionState &WST); + + /// \brief Signal termination of OMP execution for generic target entry + /// function. + void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -83,6 +70,20 @@ private: void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size) override; + /// \brief Emit outlined function specialized for the Fork-Join + /// programming model for applicable target directives on the NVPTX device. + /// \param D Directive to emit. + /// \param ParentName Name of the function that encloses the target region. + /// \param OutlinedFn Outlined function value to be defined by this call. + /// \param OutlinedFnID Outlined function ID value to be defined by this call. + /// \param IsOffloadEntry True if the outlined function is an offload entry. + /// An outlined function may not be an entry if, e.g. the if clause always + /// evaluates to false. + void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen); + /// \brief Emit outlined function for 'target' directive on the NVPTX /// device. /// \param D Directive to emit. diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp index 287089d7c4..59c4d5b277 100644 --- a/test/OpenMP/nvptx_target_codegen.cpp +++ b/test/OpenMP/nvptx_target_codegen.cpp @@ -8,9 +8,6 @@ #ifndef HEADER #define HEADER -// CHECK-DAG: [[OMP_NT:@.+]] = common addrspace(3) global i32 0 -// CHECK-DAG: [[OMP_WID:@.+]] = common addrspace(3) global i64 0 - template struct TT{ tx X; @@ -26,19 +23,22 @@ int foo(int n) { double cn[5][n]; TT d; - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l87}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -54,31 +54,34 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l87]]() - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]() + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] // CHECK: {{call|invoke}} void [[T1]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: br label {{%?}}[[EXIT:.+]] // - // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] + // + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -93,19 +96,22 @@ int foo(int n) { { } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l158}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -121,35 +127,38 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l158]](i[[SZ:32|64]] [[ARG1:%[^)]+]]) + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]]) // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]], // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]], // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16* - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T3]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T2]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // CHECK: load i16, i16* [[AA_CADDR]], - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -160,19 +169,22 @@ int foo(int n) { aa += 1; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l261}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -188,7 +200,7 @@ int foo(int n) { // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+foo.+l261]](i[[SZ]] + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_B:%.+]] = alloca [10 x float]* @@ -219,26 +231,29 @@ int foo(int n) { // CHECK-DAG: [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]], // CHECK-DAG: [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T4]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T3]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // // Use captures. // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -249,10 +264,10 @@ int foo(int n) { // CHECK-DAG: getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}} // CHECK-DAG: getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0 // - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -338,19 +353,22 @@ int bar(int n){ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+l298}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -366,7 +384,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+static.+l298]](i[[SZ]] + // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -382,36 +400,37 @@ int bar(int n){ // CHECK-DAG: [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8* // CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T5]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T4]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] // - // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // + // CHECK: [[MASTER]] + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // CHECK-64-DAG: load i32, i32* [[REF_A]] // CHECK-32-DAG: load i32, i32* [[LOCAL_A]] // CHECK-DAG: load i16, i16* [[REF_AA]] // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: br label {{%?}}[[TERM:.+]] - // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -420,19 +439,22 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l316}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -448,7 +470,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+S1.+l316]]( + // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]]( // Create local storage for each capture. // CHECK: [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]* // CHECK: [[LOCAL_B:%.+]] = alloca i[[SZ]] @@ -466,35 +488,39 @@ int bar(int n){ // CHECK-DAG: [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]], // CHECK-DAG: [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]], // CHECK-DAG: [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]], - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T6]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T5]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // Use captures. // CHECK-DAG: getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0 // CHECK-64-DAG:load i32, i32* [[REF_B]] // CHECK-32-DAG:load i32, i32* [[LOCAL_B]] // CHECK-DAG: getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}} - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // @@ -503,19 +529,22 @@ int bar(int n){ - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l281}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker() + // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, + // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, + // CHECK: store i8* null, i8** [[OMP_WORK_FN]], + // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], // CHECK: br label {{%?}}[[AWAIT_WORK:.+]] // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]], - // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0 + // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], + // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] // // CHECK: [[SEL_WORKERS]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]] - // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]] + // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]], + // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] // // CHECK: [[EXEC_PARALLEL]] @@ -531,7 +560,7 @@ int bar(int n){ // CHECK: [[EXIT]] // CHECK: ret void - // CHECK: define {{.*}}void [[T7:@__omp_offloading_.+template.+l281]](i[[SZ]] + // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]] // CHECK: [[LOCAL_AA:%.+]] = alloca i[[SZ]] @@ -544,36 +573,39 @@ int bar(int n){ // CHECK-DAG: [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16* // CHECK-DAG: [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]], // - // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - // CHECK: [[A:%.+]] = sub i32 [[WS]], 1 - // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1 - // CHECK: [[MID:%.+]] = and i32 [[B]], - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]] - // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]] - // - // CHECK: [[CHECK_WORKER]] - // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]] - // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]] + // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] + // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] + // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void [[T7]]_worker() - // CHECK: br label {{%?}}[[EXIT]] + // CHECK: {{call|invoke}} void [[T6]]_worker() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[CHECK_MASTER]] + // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], + // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] // // CHECK: [[MASTER]] - // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() - // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]]) + // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] + // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] // // CHECK-64-DAG: load i32, i32* [[REF_A]] // CHECK-32-DAG: load i32, i32* [[LOCAL_A]] // CHECK-DAG: load i16, i16* [[REF_AA]] // CHECK-DAG: getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2 // - // CHECK: br label {{%?}}[[TERM:.+]] + // CHECK: br label {{%?}}[[TERMINATE:.+]] // - // CHECK: [[TERM]] - // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]], + // CHECK: [[TERMINATE]] + // CHECK: call void @__kmpc_kernel_deinit() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[EXIT]] // -- 2.40.0