From: Alexey Bataev Date: Mon, 23 Sep 2019 14:06:51 +0000 (+0000) Subject: [OPENMP]Call __kmpc_push_tripcount in task context. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2937e8de9dfeffa30ce7d0e7448a0ebf94c54bb5;p=clang [OPENMP]Call __kmpc_push_tripcount in task context. Runtime function __kmpc_push_tripcount better to call inside of the task context for target regions. Otherwise, the libomptarget is unable to link the provided tripcount value for nowait target regions and completely looses this information. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@372609 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp index 45833e1139..46a9ec3ac2 100644 --- a/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9176,9 +9176,11 @@ void CGOpenMPRuntime::emitUDMapperArrayInitOrDel( } void CGOpenMPRuntime::emitTargetNumIterationsCall( - CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *Device, - const llvm::function_ref &SizeEmitter) { + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Value *DeviceID, + llvm::function_ref + SizeEmitter) { OpenMPDirectiveKind Kind = D.getDirectiveKind(); const OMPExecutableDirective *TD = &D; // Get nested teams distribute kind directive, if any. @@ -9187,30 +9189,24 @@ void CGOpenMPRuntime::emitTargetNumIterationsCall( if (!TD) return; const auto *LD = cast(TD); - auto &&CodeGen = [LD, &Device, &SizeEmitter, this](CodeGenFunction &CGF, + auto &&CodeGen = [LD, DeviceID, SizeEmitter, this](CodeGenFunction &CGF, PrePostActionTy &) { - llvm::Value *NumIterations = SizeEmitter(CGF, *LD); - - // Emit device ID if any. - llvm::Value *DeviceID; - if (Device) - DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device), - CGF.Int64Ty, /*isSigned=*/true); - else - DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF); - - llvm::Value *Args[] = {DeviceID, NumIterations}; - CGF.EmitRuntimeCall( - createRuntimeFunction(OMPRTL__kmpc_push_target_tripcount), Args); + if (llvm::Value *NumIterations = SizeEmitter(CGF, *LD)) { + llvm::Value *Args[] = {DeviceID, NumIterations}; + CGF.EmitRuntimeCall( + createRuntimeFunction(OMPRTL__kmpc_push_target_tripcount), Args); + } }; emitInlinedDirective(CGF, OMPD_unknown, CodeGen); } -void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF, - const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, - llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device) { +void CGOpenMPRuntime::emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + const Expr *Device, + llvm::function_ref + SizeEmitter) { if (!CGF.HaveInsertPoint()) return; @@ -9229,8 +9225,8 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF, llvm::Value *MapTypesArray = nullptr; // Fill up the pointer arrays and transfer execution to the device. auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, - &MapTypesArray, &CS, RequiresOuterTask, - &CapturedVars](CodeGenFunction &CGF, PrePostActionTy &) { + &MapTypesArray, &CS, RequiresOuterTask, &CapturedVars, + SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target @@ -9262,6 +9258,9 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF, llvm::Value *NumTeams = emitNumTeamsForTargetDirective(CGF, D); llvm::Value *NumThreads = emitNumThreadsForTargetDirective(CGF, D); + // Emit tripcount for the target loop-based directive. + emitTargetNumIterationsCall(CGF, D, DeviceID, SizeEmitter); + bool HasNowait = D.hasClausesOfKind(); // The target region is an outlined function launched by the runtime // via calls __tgt_target() or __tgt_target_teams(). @@ -11285,12 +11284,13 @@ void CGOpenMPSIMDRuntime::emitTargetOutlinedFunction( llvm_unreachable("Not supported in SIMD-only mode"); } -void CGOpenMPSIMDRuntime::emitTargetCall(CodeGenFunction &CGF, - const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, - llvm::Value *OutlinedFnID, - const Expr *IfCond, - const Expr *Device) { +void CGOpenMPSIMDRuntime::emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + const Expr *Device, + llvm::function_ref + SizeEmitter) { llvm_unreachable("Not supported in SIMD-only mode"); } diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h index 92e946c670..afec628feb 100644 --- a/lib/CodeGen/CGOpenMPRuntime.h +++ b/lib/CodeGen/CGOpenMPRuntime.h @@ -793,6 +793,17 @@ private: /// default. virtual unsigned getDefaultFirstprivateAddressSpace() const { return 0; } + /// Emit code that pushes the trip count of loops associated with constructs + /// 'target teams distribute' and 'teams distribute parallel for'. + /// \param SizeEmitter Emits the int64 value for the number of iterations of + /// the associated loop. + void emitTargetNumIterationsCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Value *DeviceID, + llvm::function_ref + SizeEmitter); + public: explicit CGOpenMPRuntime(CodeGenModule &CGM) : CGOpenMPRuntime(CGM, ".", ".") {} @@ -1414,15 +1425,6 @@ public: bool IsOffloadEntry, const RegionCodeGenTy &CodeGen); - /// Emit code that pushes the trip count of loops associated with constructs - /// 'target teams distribute' and 'teams distribute parallel for'. - /// \param SizeEmitter Emits the int64 value for the number of iterations of - /// the associated loop. - virtual void emitTargetNumIterationsCall( - CodeGenFunction &CGF, const OMPExecutableDirective &D, const Expr *Device, - const llvm::function_ref &SizeEmitter); - /// Emit the target offloading code associated with \a D. The emitted /// code attempts offloading the execution to the device, an the event of /// a failure it executes the host version outlined in \a OutlinedFn. @@ -1433,11 +1435,15 @@ public: /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the /// target directive, or null if no device clause is used. - virtual void emitTargetCall(CodeGenFunction &CGF, - const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, - llvm::Value *OutlinedFnID, const Expr *IfCond, - const Expr *Device); + /// \param SizeEmitter Callback to emit number of iterations for loop-based + /// directives. + virtual void + emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, + const Expr *IfCond, const Expr *Device, + llvm::function_ref + SizeEmitter); /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. Returns true if @@ -2117,9 +2123,13 @@ public: /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the /// target directive, or null if no device clause is used. - void emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device) override; + void + emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, + const Expr *IfCond, const Expr *Device, + llvm::function_ref + SizeEmitter) override; /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. Returns true if diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp index e34b820625..b41ad55322 100644 --- a/lib/CodeGen/CGStmtOpenMP.cpp +++ b/lib/CodeGen/CGStmtOpenMP.cpp @@ -4137,18 +4137,21 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, CGM.getOpenMPRuntime().emitTargetOutlinedFunction(S, ParentName, Fn, FnID, IsOffloadEntry, CodeGen); OMPLexicalScope Scope(CGF, S, OMPD_task); - auto &&SizeEmitter = [](CodeGenFunction &CGF, const OMPLoopDirective &D) { - OMPLoopScope(CGF, D); - // Emit calculation of the iterations count. - llvm::Value *NumIterations = CGF.EmitScalarExpr(D.getNumIterations()); - NumIterations = CGF.Builder.CreateIntCast(NumIterations, CGF.Int64Ty, - /*isSigned=*/false); - return NumIterations; + auto &&SizeEmitter = + [IsOffloadEntry](CodeGenFunction &CGF, + const OMPLoopDirective &D) -> llvm::Value * { + if (IsOffloadEntry) { + OMPLoopScope(CGF, D); + // Emit calculation of the iterations count. + llvm::Value *NumIterations = CGF.EmitScalarExpr(D.getNumIterations()); + NumIterations = CGF.Builder.CreateIntCast(NumIterations, CGF.Int64Ty, + /*isSigned=*/false); + return NumIterations; + } + return nullptr; }; - if (IsOffloadEntry) - CGM.getOpenMPRuntime().emitTargetNumIterationsCall(CGF, S, Device, - SizeEmitter); - CGM.getOpenMPRuntime().emitTargetCall(CGF, S, Fn, FnID, IfCond, Device); + CGM.getOpenMPRuntime().emitTargetCall(CGF, S, Fn, FnID, IfCond, Device, + SizeEmitter); } static void emitTargetRegion(CodeGenFunction &CGF, const OMPTargetDirective &S, diff --git a/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp index 4c7ac2441c..1617f8510a 100644 --- a/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp +++ b/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp @@ -54,17 +54,14 @@ int target_teams_fun(int *g){ // discard capture expressions for te and th // HCK1: = alloca i32, // HCK1: = alloca i32, - // HCK1: = alloca i32, - // HCK1: = alloca i32, - // HCK1: = alloca i32, // HCK1: [[N_CAST:%.+]] = alloca i{{32|64}}, // HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, - // HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]], // HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], - // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, + // HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) + // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, // HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]]) #pragma omp target teams distribute parallel for num_teams(te), thread_limit(th) diff --git a/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp index 6846aaf434..2c74b18fbd 100644 --- a/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp @@ -52,18 +52,15 @@ int target_teams_fun(int *g){ // discard capture expressions for te and th // HCK1: = alloca i32, // HCK1: = alloca i32, -// HCK1: = alloca i32, -// HCK1: = alloca i32, -// HCK1: = alloca i32, // HCK1: [[I_CAST:%.+]] = alloca i{{32|64}}, // HCK1: [[N_CAST:%.+]] = alloca i{{32|64}}, // HCK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // HCK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, -// HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // HCK1: [[I_PAR:%.+]] = load{{.+}}, {{.+}} [[I_CAST]], // HCK1: [[N_PAR:%.+]] = load{{.+}}, {{.+}} [[N_CAST]], // HCK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // HCK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], +// HCK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // HCK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}}, // HCK1: call void @[[OFFL1:.+]](i{{32|64}} [[I_PAR]], i{{32|64}} [[N_PAR]], {{.+}}, i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]]) @@ -77,8 +74,7 @@ int target_teams_fun(int *g){ // HCK1: call void @[[OFFL2:.+]](i{{64|32}} %{{.+}}) {{{ #pragma omp target teams distribute parallel for simd is_device_ptr(g) simdlen(8) - for( - int i = 0; i < n; i++) { + for(int i = 0; i < n; i++) { a[i] = g[0]; } }}} diff --git a/test/OpenMP/teams_distribute_codegen.cpp b/test/OpenMP/teams_distribute_codegen.cpp index ea299bc346..0065c6cbd5 100644 --- a/test/OpenMP/teams_distribute_codegen.cpp +++ b/test/OpenMP/teams_distribute_codegen.cpp @@ -21,22 +21,19 @@ int a[100]; // CK1: define {{.*}}i32 @{{.+}}teams_argument_globali( -int teams_argument_global(int n){ +int teams_argument_global(int n) { int te = n / 128; int th = 128; // discard n_addr // CK1: alloca i32, // CK1: [[TE:%.+]] = alloca i32, // CK1: [[TH:%.+]] = alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, // CK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // CK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, - // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], + // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 {{.+}}, i32 {{.+}}) // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]], diff --git a/test/OpenMP/teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_codegen.cpp index fa425b3eca..bc42f740ad 100644 --- a/test/OpenMP/teams_distribute_parallel_for_codegen.cpp +++ b/test/OpenMP/teams_distribute_parallel_for_codegen.cpp @@ -28,14 +28,11 @@ int teams_argument_global(int n){ // CK1: alloca i32, // CK1: [[TE:%.+]] = alloca i32, // CK1: [[TH:%.+]] = alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, // CK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // CK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, - // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], + // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 {{.+}}, i32 {{.+}}) // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]], diff --git a/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp index 45793419d7..fd2b2da533 100644 --- a/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp +++ b/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp @@ -28,15 +28,12 @@ int teams_argument_global(int n){ // CK1: alloca i32, // CK1: [[TE:%.+]] = alloca i32, // CK1: [[TH:%.+]] = alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, // CK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // CK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, - // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], + // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 4, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0) // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]], diff --git a/test/OpenMP/teams_distribute_simd_codegen.cpp b/test/OpenMP/teams_distribute_simd_codegen.cpp index ab1482855e..ff2a752cfa 100644 --- a/test/OpenMP/teams_distribute_simd_codegen.cpp +++ b/test/OpenMP/teams_distribute_simd_codegen.cpp @@ -30,15 +30,12 @@ int teams_argument_global(int n) { // CK1: alloca i32, // CK1: [[TE:%.+]] = alloca i32, // CK1: [[TH:%.+]] = alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, - // CK1: alloca i32, // CK1: [[TE_CAST:%.+]] = alloca i{{32|64}}, // CK1: [[TH_CAST:%.+]] = alloca i{{32|64}}, - // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: [[TE_PAR:%.+]] = load{{.+}}, {{.+}} [[TE_CAST]], // CK1: [[TH_PAR:%.+]] = load{{.+}}, {{.+}} [[TH_CAST]], + // CK1: call void @__kmpc_push_target_tripcount(i64 -1, i64 %{{.+}}) // CK1: call i32 @__tgt_target_teams(i64 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i64* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 {{.+}}, i32 1) // CK1: call void @[[OFFL1:.+]](i{{32|64}} [[TE_PAR]], i{{32|64}} [[TH_PAR]],