From 456575d0e83a9cbf7e35e844a431a0f3ec58c582 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 3 Apr 2019 17:57:06 +0000
Subject: [PATCH] [OPENMP]Add codegen for firstprivate vars with allocate clause.

Added codegen/test for the firstprivatized variables with the allocate clause.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@357617 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGDecl.cpp                        | 17 +++--
 lib/CodeGen/CGOpenMPRuntime.cpp               |  7 ++-
 lib/CodeGen/CGStmtOpenMP.cpp                  |  8 ++-
 test/OpenMP/parallel_firstprivate_codegen.cpp | 62 +++++++++++++++++--
 4 files changed, 76 insertions(+), 18 deletions(-)

diff --git a/lib/CodeGen/CGDecl.cpp b/lib/CodeGen/CGDecl.cpp
index 8a871a04f0..6f27d879a9 100644
--- a/lib/CodeGen/CGDecl.cpp
+++ b/lib/CodeGen/CGDecl.cpp
@@ -1457,7 +1457,13 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
   Address address = Address::invalid();
   Address AllocaAddr = Address::invalid();
 
-  if (Ty->isConstantSizeType()) {
+  Address OpenMPLocalAddr =
+      getLangOpts().OpenMP
+          ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
+          : Address::invalid();
+  if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
+    address = OpenMPLocalAddr;
+  } else if (Ty->isConstantSizeType()) {
     bool NRVO = getLangOpts().ElideConstructors &&
       D.isNRVOVariable();
 
@@ -1500,14 +1506,7 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
     // unless:
     // - it's an NRVO variable.
    // - we are compiling OpenMP and it's an OpenMP local variable.
-
-    Address OpenMPLocalAddr =
-        getLangOpts().OpenMP
-            ? CGM.getOpenMPRuntime().getAddressOfLocalVariable(*this, &D)
-            : Address::invalid();
-    if (getLangOpts().OpenMP && OpenMPLocalAddr.isValid()) {
-      address = OpenMPLocalAddr;
-    } else if (NRVO) {
+    if (NRVO) {
       // The named return value optimization: allocate this variable in the
       // return slot, so that we can elide the copy when returning this
      // variable (C++0x [class.copy]p34).
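For reference, the source construct that now reaches this early OpenMP path in EmitAutoVarAlloca is a firstprivate variable that also appears in an allocate clause. The snippet below is illustrative only and is not part of the patch; it assumes a compiler and runtime with OpenMP 5.0 allocator support and the predefined omp_default_mem_alloc handle from <omp.h>.

    #include <omp.h>
    #include <cstdio>

    int main() {
      int t_var = 42;
      // Each thread's private copy of t_var is expected to come from the
      // requested allocator rather than from an ordinary stack alloca.
    #pragma omp parallel allocate(omp_default_mem_alloc: t_var) firstprivate(t_var)
      {
        std::printf("%d\n", t_var);
      }
      return 0;
    }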
diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp
index 65c5c1ca30..67465c1acf 100644
--- a/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -9768,10 +9768,13 @@ Address CGOpenMPRuntime::getAddressOfLocalVariable(CodeGenFunction &CGF,
   CharUnits Align = CGM.getContext().getDeclAlign(CVD);
   if (CVD->getType()->isVariablyModifiedType()) {
     Size = CGF.getTypeSize(CVD->getType());
-    Align = CGM.getContext().getTypeAlignInChars(CVD->getType());
+    // Align the size: ((size + align - 1) / align) * align
+    Size = CGF.Builder.CreateNUWAdd(
+        Size, CGM.getSize(Align - CharUnits::fromQuantity(1)));
+    Size = CGF.Builder.CreateUDiv(Size, CGM.getSize(Align));
+    Size = CGF.Builder.CreateNUWMul(Size, CGM.getSize(Align));
   } else {
     CharUnits Sz = CGM.getContext().getTypeSizeInChars(CVD->getType());
-    Align = CGM.getContext().getDeclAlign(CVD);
     Size = CGM.getSize(Sz.alignTo(Align));
   }
   llvm::Value *ThreadID = getThreadID(CGF, CVD->getBeginLoc());
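The new IR rounds the byte size of a variably modified type up to the declared alignment before calling the allocator, using the ((size + align - 1) / align) * align identity spelled out in the comment. A standalone sketch of the same arithmetic (plain C++, not CGOpenMPRuntime code):

    #include <cstdint>

    // Mirrors the emitted add nuw / udiv / mul nuw sequence.
    constexpr std::uint64_t alignSizeUp(std::uint64_t Size, std::uint64_t Align) {
      return (Size + Align - 1) / Align * Align;
    }

    static_assert(alignSizeUp(72, 16) == 80, "rounded up to the next multiple");
    static_assert(alignSizeUp(80, 16) == 80, "already-aligned sizes are unchanged");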
diff --git a/lib/CodeGen/CGStmtOpenMP.cpp b/lib/CodeGen/CGStmtOpenMP.cpp
index e739a117bd..d27afcdd33 100644
--- a/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/lib/CodeGen/CGStmtOpenMP.cpp
@@ -750,8 +750,10 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
       bool ThisFirstprivateIsLastprivate =
           Lastprivates.count(OrigVD->getCanonicalDecl()) > 0;
       const FieldDecl *FD = CapturedStmtInfo->lookup(OrigVD);
+      const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IInit)->getDecl());
       if (!MustEmitFirstprivateCopy && !ThisFirstprivateIsLastprivate && FD &&
-          !FD->getType()->isReferenceType()) {
+          !FD->getType()->isReferenceType() &&
+          (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())) {
         EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl());
         ++IRef;
         ++InitsRef;
@@ -760,7 +762,8 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
       // Do not emit copy for firstprivate constant variables in target regions,
       // captured by reference.
       if (DeviceConstTarget && OrigVD->getType().isConstant(getContext()) &&
-          FD && FD->getType()->isReferenceType()) {
+          FD && FD->getType()->isReferenceType() &&
+          (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())) {
         (void)CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(*this,
                                                                     OrigVD);
         ++IRef;
@@ -770,7 +773,6 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D,
       FirstprivateIsLastprivate =
          FirstprivateIsLastprivate || ThisFirstprivateIsLastprivate;
       if (EmittedAsFirstprivate.insert(OrigVD->getCanonicalDecl()).second) {
-        const auto *VD = cast<VarDecl>(cast<DeclRefExpr>(IInit)->getDecl());
         const auto *VDInit =
             cast<VarDecl>(cast<DeclRefExpr>(*InitsRef)->getDecl());
         bool IsRegistered;
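The effect of the two new conditions is that a firstprivate variable carrying an OMPAllocateDeclAttr never takes the copy-elision shortcuts, so a private copy is always materialized and its storage is later provided by getAddressOfLocalVariable. A hedged sketch of that decision, using stand-in fields rather than the real Clang AST queries:

    // Stand-ins only; the real code tests FD, MustEmitFirstprivateCopy and
    // VD->hasAttr<OMPAllocateDeclAttr>() on Clang AST nodes.
    struct FirstprivateVar {
      bool CapturedByValue;   // stands in for FD && !FD->getType()->isReferenceType()
      bool IsAlsoLastprivate; // stands in for ThisFirstprivateIsLastprivate
      bool HasAllocateAttr;   // stands in for VD->hasAttr<OMPAllocateDeclAttr>()
    };

    inline bool canSkipFirstprivateCopy(const FirstprivateVar &V) {
      // With an allocate clause the copy can never be skipped, because the
      // private storage has to come from the user-specified allocator.
      return V.CapturedByValue && !V.IsAlsoLastprivate && !V.HasAllocateAttr;
    }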
diff --git a/test/OpenMP/parallel_firstprivate_codegen.cpp b/test/OpenMP/parallel_firstprivate_codegen.cpp
index 6772f61779..e448a406e0 100644
--- a/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -33,6 +33,16 @@
 #ifndef HEADER
 #define HEADER
 
+typedef void **omp_allocator_handle_t;
+extern const omp_allocator_handle_t omp_default_mem_alloc;
+extern const omp_allocator_handle_t omp_large_cap_mem_alloc;
+extern const omp_allocator_handle_t omp_const_mem_alloc;
+extern const omp_allocator_handle_t omp_high_bw_mem_alloc;
+extern const omp_allocator_handle_t omp_low_lat_mem_alloc;
+extern const omp_allocator_handle_t omp_cgroup_mem_alloc;
+extern const omp_allocator_handle_t omp_pteam_mem_alloc;
+extern const omp_allocator_handle_t omp_thread_mem_alloc;
+
 struct St {
   int a, b;
   St() : a(0), b(0) {}
@@ -322,7 +332,7 @@ int main() {
     s_arr[0] = var;
     sivar = 2;
   }
-#pragma omp parallel firstprivate(t_var)
+#pragma omp parallel allocate(omp_default_mem_alloc: t_var) firstprivate(t_var)
   {}
   return tmain<int>();
 #endif
@@ -333,6 +343,7 @@ int main() {
 // CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
 // CHECK: [[T_VAR:%.+]] = alloca i32,
 // CHECK: [[T_VARCAST:%.+]] = alloca [[iz:i64|i32]],
 // CHECK: [[SIVARCAST:%.+]] = alloca [[iz]],
+// CHECK: [[T_VARCAST1:%.+]] = alloca [[iz:i64|i32]],
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
 // CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST]] to i32*
@@ -345,6 +356,12 @@ int main() {
 // CHECK-32: store i32 [[SIVARVAL]], i32* [[SIVARCAST]],
 // CHECK: [[SIVARPVT:%.+]] = load [[iz]], [[iz]]* [[SIVARCAST]],
 // CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, [[iz]], [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[MAIN_MICROTASK:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]],{{.*}}[[iz]] [[SIVARPVT]]
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
+// CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST1]] to i32*
+// CHECK-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST1]],
+// CHECK: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST1]],
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[iz]])* [[MAIN_MICROTASK1:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]])
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -387,6 +404,24 @@ int main() {
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 // CHECK: ret void
+
+
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[iz]] [[T_VAR:%.+]])
+// CHECK: [[GTID_ADDR:%.+]] = alloca i32*,
+// CHECK: store [[iz]] [[T_VAR]], [[iz]]* [[T_VAR_ADDR:%.+]],
+// CHECK-64: [[BC:%.+]] = bitcast [[iz]]* [[T_VAR_ADDR]] to i32*
+// CHECK: [[GTID_PTR:%.+]] = load i32*, i32** [[GTID_ADDR]],
+// CHECK: [[GTID:%.+]] = load i32, i32* [[GTID_PTR]],
+// CHECK: [[ALLOCATOR:%.+]] = load i8**, i8*** @omp_default_mem_alloc,
+// CHECK: [[T_VAR_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID]], [[iz]] 4, i8** [[ALLOCATOR]])
+// CHECK: [[T_VAR_PRIV:%.+]] = bitcast i8* [[T_VAR_VOID_PTR]] to i32*
+// CHECK-32: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK-64: [[T_VAR_VAL:%.+]] = load i32, i32* [[BC]],
+// CHECK: store i32 [[T_VAR_VAL]], i32* [[T_VAR_PRIV]],
+// CHECK: call void @__kmpc_free(i32 [[GTID]], i8* [[T_VAR_VOID_PTR]], i8** [[ALLOCATOR]])
+// CHECK: ret void
+
+
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
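The MAIN_MICROTASK1 block above checks that the outlined parallel region obtains the private t_var from __kmpc_alloc, copies the captured value into it, and releases it with __kmpc_free through the same allocator. Roughly, the call shape being verified looks like the sketch below; the prototypes are paraphrased from the CHECK lines rather than taken from the runtime headers, and the allocator-handle name is a stand-in.

    extern "C" {
    void *__kmpc_alloc(int gtid, unsigned long size, void **allocator); // shape as in the CHECK lines
    void __kmpc_free(int gtid, void *ptr, void **allocator);
    }

    extern void **default_alloc_handle; // stand-in for the load from @omp_default_mem_alloc

    inline void outlined_body_sketch(int gtid, int captured_t_var) {
      void *raw = __kmpc_alloc(gtid, sizeof(int), default_alloc_handle);
      int *t_var_priv = static_cast<int *>(raw); // bitcast i8* -> i32*
      *t_var_priv = captured_t_var;              // firstprivate copy-in
      // ... body of the parallel region uses *t_var_priv ...
      __kmpc_free(gtid, raw, default_alloc_handle);
    }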
@@ -480,6 +515,16 @@ int main() {
 #endif
 #else
+typedef void **omp_allocator_handle_t;
+extern const omp_allocator_handle_t omp_default_mem_alloc;
+extern const omp_allocator_handle_t omp_large_cap_mem_alloc;
+extern const omp_allocator_handle_t omp_const_mem_alloc;
+extern const omp_allocator_handle_t omp_high_bw_mem_alloc;
+extern const omp_allocator_handle_t omp_low_lat_mem_alloc;
+extern const omp_allocator_handle_t omp_cgroup_mem_alloc;
+extern const omp_allocator_handle_t omp_pteam_mem_alloc;
+extern const omp_allocator_handle_t omp_thread_mem_alloc;
+
 struct St {
   int a, b;
   St() : a(0), b(0) {}
@@ -488,7 +533,7 @@ struct St {
   void St_func(St s[2], int n, long double vla1[n]) {
     double vla2[n][n] __attribute__((aligned(128)));
     a = b;
-#pragma omp parallel firstprivate(s, vla1, vla2)
+#pragma omp parallel allocate(omp_thread_mem_alloc:vla2) firstprivate(s, vla1, vla2)
     vla1[b] = vla2[1][n - 1] = a = b;
   }
 };
@@ -521,9 +566,18 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 // ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
 // ARRAY-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
-// ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
-// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 %{{.+}}, i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
+// ARRAY: [[SZ1:%.+]] = add nuw i64 [[SIZE]], 127
+// ARRAY: [[SZ2:%.+]] = udiv i64 [[SZ1]], 128
+// ARRAY: [[SIZE:%.+]] = mul nuw i64 [[SZ2]], 128
+// ARRAY: [[ALLOCATOR:%.+]] = load i8**, i8*** @omp_thread_mem_alloc,
+// ARRAY: [[VLA2_VOID_PTR:%.+]] = call i8* @__kmpc_alloc(i32 [[GTID:%.+]], i64 [[SIZE]], i8** [[ALLOCATOR]])
+// ARRAY: [[VLA2_PTR:%.+]] = bitcast i8* [[VLA2_VOID_PTR]] to double*
+// ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
+// ARRAY: [[BC:%.+]] = bitcast double* [[VLA2_PTR]] to i8*
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 128 [[BC]], i8* align 128 %{{.+}}, i64 [[SIZE]], i1 false)
+// ARRAY: call void @__kmpc_free(i32 [[GTID]], i8* [[VLA2_VOID_PTR]], i8** [[ALLOCATOR]])
+// ARRAY-NEXT: ret void
 #endif
-- 
2.40.0
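The ARRAY checks cover the variably sized case: the element count is multiplied by 8, the result is rounded up to the 128-byte alignment of vla2, and that size is passed to __kmpc_alloc before the firstprivate copy is performed with memcpy. For completeness, a user-level example of the construct they exercise (illustrative only; it relies on the VLA extension the test itself uses and on OpenMP 5.0 allocate support):

    #include <omp.h>

    void vla_firstprivate(int n, double seed) {
      double vla2[n][n] __attribute__((aligned(128)));
      vla2[0][0] = seed;
      // The private copy of vla2 is expected to come from a __kmpc_alloc'ed
      // block whose size is rounded up to a multiple of 128 bytes.
    #pragma omp parallel allocate(omp_thread_mem_alloc: vla2) firstprivate(vla2)
      {
        double first = vla2[0][0]; // each thread reads its own allocator-backed copy
        (void)first;
      }
    }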