From 0a80d6bb5733baa1a7cea517eea76400d7091a54 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Fri, 12 Oct 2018 20:19:59 +0000
Subject: [PATCH] [OPENMP][NVPTX]Reduce memory usage in target region.

Additional reduction of the global memory usage in the target regions
without parallel regions.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344413 91177308-0d34-0410-b5e6-96231b3b80d8
---
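For illustration only (not part of the diff): a minimal OpenMP C++ sketch of
the pattern this change affects, loosely modeled on
test/OpenMP/nvptx_data_sharing.cpp below; the function and variable names are
invented. Locals declared in the sequential, team-master part of a non-SPMD
target region that escape into inner parallel regions are globalized through
__kmpc_data_sharing_push_stack. Before this patch each such variable got a
warp-sized [32 x i32] slot in the globalized record (a 256-byte allocation in
that test); with IsInTTDRegion set while emitting the target/teams/distribute
part they become plain scalar fields (an 8-byte allocation), as the updated
CHECK lines show.

// Illustrative sketch only; names are invented, not taken from the tests.
void sketch() {
#pragma omp target
  {
    int x = 10;  // escapes into the first parallel region -> globalized
    int y = 100; // escapes into the second parallel region -> globalized
#pragma omp parallel
    x += 1;
#pragma omp parallel
    y += x;
  }
}

The corresponding codegen change is the new IsInTTDRegion flag: it is set
around emitTargetOutlinedFunctionHelper in emitNonSPMDKernel/emitSPMDKernel,
saved and cleared while emitting parallel outlined functions, and consulted in
emitFunctionProlog when building the globalized record.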
 lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp   | 29 +++++++++++++++-----------
 lib/CodeGen/CGOpenMPRuntimeNVPTX.h     |  3 +++
 test/OpenMP/nvptx_data_sharing.cpp     | 12 +++--------
 test/OpenMP/nvptx_parallel_codegen.cpp |  7 ++-----
 test/OpenMP/nvptx_teams_codegen.cpp    | 28 +++++++------------------
 5 files changed, 33 insertions(+), 46 deletions(-)

diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 7ae8377311..4d5a580bd6 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -371,11 +371,11 @@ class CheckVarsEscapingDeclContext final
     }
   }
 
-  void buildRecordForGlobalizedVars(bool IsInTargetMasterThreadRegion) {
+  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
     assert(!GlobalizedRD &&
            "Record for globalized variables is built already.");
     ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
-    if (IsInTargetMasterThreadRegion)
+    if (IsInTTDRegion)
       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
     else
       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
@@ -527,9 +527,9 @@ public:
 
   /// Returns the record that handles all the escaped local variables and used
   /// instead of their original storage.
-  const RecordDecl *getGlobalizedRecord(bool IsInTargetMasterThreadRegion) {
+  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
     if (!GlobalizedRD)
-      buildRecordForGlobalizedVars(IsInTargetMasterThreadRegion);
+      buildRecordForGlobalizedVars(IsInTTDRegion);
     return GlobalizedRD;
   }
 
@@ -1132,8 +1132,10 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
     }
   } Action(EST, WST);
   CodeGen.setAction(Action);
+  IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
   // Now change the name of the worker function to correspond to this target
   // region's entry function.
@@ -1246,8 +1248,10 @@ void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
     }
   } Action(*this, EST, D);
   CodeGen.setAction(Action);
+  IsInTTDRegion = true;
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
+  IsInTTDRegion = false;
 }
 
 static void
@@ -1851,12 +1855,15 @@ llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     }
   } Action(IsInParallelRegion);
   CodeGen.setAction(Action);
+  bool PrevIsInTTDRegion = IsInTTDRegion;
+  IsInTTDRegion = false;
   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
   IsInTargetMasterThreadRegion = false;
   auto *OutlinedFun =
       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
           D, ThreadIDVar, InnermostKind, CodeGen));
   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
+  IsInTTDRegion = PrevIsInTTDRegion;
   if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
       !IsInParallelRegion) {
     llvm::Function *WrapperFun =
@@ -4142,7 +4149,7 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   CheckVarsEscapingDeclContext VarChecker(CGF);
   VarChecker.Visit(Body);
   const RecordDecl *GlobalizedVarsRecord =
-      VarChecker.getGlobalizedRecord(IsInTargetMasterThreadRegion);
+      VarChecker.getGlobalizedRecord(IsInTTDRegion);
   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
       VarChecker.getEscapedVariableLengthDecls();
   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
@@ -4160,22 +4167,20 @@ void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(
-        std::make_pair(VD, MappedVarData(FD, IsInTargetMasterThreadRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
   }
-  if (!IsInTargetMasterThreadRegion && !NeedToDelayGlobalization &&
-      !IsInParallelRegion) {
+  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
     CheckVarsEscapingDeclContext VarChecker(CGF);
     VarChecker.Visit(Body);
     I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTargetMasterThreadRegion=*/true);
+        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
     I->getSecond().SecondaryLocalVarData.emplace();
     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
       const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(std::make_pair(
-          VD, MappedVarData(FD, /*IsInTargetMasterThreadRegion=*/true)));
+      Data.insert(
+          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
     }
   }
   if (!NeedToDelayGlobalization) {
diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
index 53ef13b132..d40e6cfbdd 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -360,6 +360,9 @@ private:
   /// true if we're emitting the code for the target region and next parallel
   /// region is L0 for sure.
   bool IsInTargetMasterThreadRegion = false;
+  /// true if currently emitting code for target/teams/distribute region, false
+  /// - otherwise.
+  bool IsInTTDRegion = false;
   /// true if we're definitely in the parallel region.
   bool IsInParallelRegion = false;
diff --git a/test/OpenMP/nvptx_data_sharing.cpp b/test/OpenMP/nvptx_data_sharing.cpp
index 8320676924..0acb119915 100644
--- a/test/OpenMP/nvptx_data_sharing.cpp
+++ b/test/OpenMP/nvptx_data_sharing.cpp
@@ -39,16 +39,10 @@ void test_ds(){
 // CK1: [[SHAREDARGS2:%.+]] = alloca i8**
 // CK1: call void @__kmpc_kernel_init
 // CK1: call void @__kmpc_data_sharing_init_stack
-// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 256, i16 0)
+// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i64 8, i16 0)
 // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
-// CK1: [[A_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[A:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ARR]], i32 0, i32 [[LID]]
-// CK1: [[B_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[B:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[B_ARR]], i32 0, i32 [[LID]]
+// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
+// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
 // CK1: store i32 10, i32* [[A]]
 // CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1)
 // CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp
index 978804d249..d1a3104407 100644
--- a/test/OpenMP/nvptx_parallel_codegen.cpp
+++ b/test/OpenMP/nvptx_parallel_codegen.cpp
@@ -318,14 +318,11 @@ int bar(int n){
 // CHECK-32: [[A_ADDR:%.+]] = alloca i32,
 // CHECK-64: [[A_ADDR:%.+]] = alloca i64,
 // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32*
-// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 128, i16 0)
+// CHECK: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 4, i16 0)
 // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty*
 // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]],
 // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]],
-// CHECK: [[GLOBAL_A_ADDR_ARR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[GLOBAL_A_ADDR_ARR]], i32 0, i32 [[LID]]
+// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]],
 
 // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}})
diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp
index 91b372c65b..4e3f2674fc 100644
--- a/test/OpenMP/nvptx_teams_codegen.cpp
+++ b/test/OpenMP/nvptx_teams_codegen.cpp
@@ -36,13 +36,10 @@ int main (int argc, char **argv) {
 // CK1: store {{.+}} 0, {{.+}},
 // CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
-// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
 // CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
 // CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK1: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
 // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -56,12 +53,9 @@ int main (int argc, char **argv) {
 // CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
 // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
 // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
-// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK1: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
 // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK1: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK1: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK1: [[LID:%.+]] = and i32 [[TID]], 31
-// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
 // CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
@@ -117,13 +111,10 @@ int main (int argc, char **argv) {
 // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
 // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
 // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
-// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 128, i16 0)
+// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} 4, i16 0)
 // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
 // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
-// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2: [[LID:%.+]] = and i32 [[TID]], 31
-// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]],
 // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num(
 // CK2: store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
@@ -141,12 +132,9 @@ int main (int argc, char **argv) {
 // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
 // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
 // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
-// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{128|256}}, i16 0)
+// CK2: call i8* @__kmpc_data_sharing_push_stack(i{{[0-9]+}} {{4|8}}, i16 0)
 // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
-// CK2: [[ARGCADDR_ARR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CK2: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CK2: [[LID:%.+]] = and i32 [[TID]], 31
-// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds [32 x i8**], [32 x i8**]* [[ARGCADDR_ARR]], i32 0, i32 [[LID]]
+// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
 // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num(
 // CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
-- 
2.40.0