From c0a2732d30f9ffb73b9b449f2c1d4c858f153dfe Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 11 Oct 2018 18:30:31 +0000 Subject: [PATCH] [OPENMP][NVPTX]Reduce memory use for globalized vars in target/teams/distribute regions. Previously introduced globalization scheme that uses memory coalescing scheme may increase memory usage fr the variables that are devlared in target/teams/distribute contexts. We don't need 32 copies of such variables, just 1. Patch reduces memory use in this case. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344273 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 22 +++++++++++++------ ...stribute_parallel_generic_mode_codegen.cpp | 7 ++---- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 04de395662..944130b1ab 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -203,7 +203,8 @@ static RecordDecl *buildRecordForGlobalizedVars( std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(), stable_sort_comparator); // Build struct _globalized_locals_ty { - // /* globalized vars */[32] align (max(decl_align, 128)) + // /* globalized vars */[WarSize] align (max(decl_align, + // GlobalMemoryAlignment)) // /* globalized vars */ for EscapedDeclsForTeams // }; RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty"); @@ -370,11 +371,16 @@ class CheckVarsEscapingDeclContext final } } - void buildRecordForGlobalizedVars() { + void buildRecordForGlobalizedVars(bool IsInTargetMasterThreadRegion) { assert(!GlobalizedRD && "Record for globalized variables is built already."); + ArrayRef EscapedDeclsForParallel, EscapedDeclsForTeams; + if (IsInTargetMasterThreadRegion) + EscapedDeclsForTeams = EscapedDecls.getArrayRef(); + else + EscapedDeclsForParallel = EscapedDecls.getArrayRef(); GlobalizedRD = ::buildRecordForGlobalizedVars( - CGF.getContext(), EscapedDecls.getArrayRef(), llvm::None, + CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams, MappedDeclsFields); } @@ -521,9 +527,9 @@ public: /// Returns the record that handles all the escaped local variables and used /// instead of their original storage. - const RecordDecl *getGlobalizedRecord() { + const RecordDecl *getGlobalizedRecord(bool IsInTargetMasterThreadRegion) { if (!GlobalizedRD) - buildRecordForGlobalizedVars(); + buildRecordForGlobalizedVars(IsInTargetMasterThreadRegion); return GlobalizedRD; } @@ -4087,7 +4093,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, return; CheckVarsEscapingDeclContext VarChecker(CGF); VarChecker.Visit(Body); - const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord(); + const RecordDecl *GlobalizedVarsRecord = + VarChecker.getGlobalizedRecord(IsInTargetMasterThreadRegion); ArrayRef EscapedVariableLengthDecls = VarChecker.getEscapedVariableLengthDecls(); if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty()) @@ -4105,7 +4112,8 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD))); + Data.insert( + std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion))); } if (!NeedToDelayGlobalization) { emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true); diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 2a7344f773..2b18f6d3f9 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -26,13 +26,10 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0) +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0) // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align -// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]] +// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]], // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2 -- 2.40.0