From 1188822bbcdd02f816e857f1c15df32d0d8c5336 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 4 Jan 2019 17:25:09 +0000 Subject: [PATCH] [OPENMP][NVPTX]Use new functions from the runtime library. Updated codegen to use the new functions from the runtime library. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@350415 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 73 +++++++++++-------- test/OpenMP/nvptx_data_sharing.cpp | 4 +- ...stribute_parallel_generic_mode_codegen.cpp | 4 +- test/OpenMP/nvptx_parallel_codegen.cpp | 4 +- test/OpenMP/nvptx_parallel_for_codegen.cpp | 4 +- test/OpenMP/nvptx_target_codegen.cpp | 2 +- ...vptx_target_parallel_reduction_codegen.cpp | 6 +- .../nvptx_target_teams_distribute_codegen.cpp | 2 +- ..._teams_distribute_parallel_for_codegen.cpp | 4 +- ...s_distribute_parallel_for_simd_codegen.cpp | 4 +- test/OpenMP/nvptx_teams_codegen.cpp | 8 +- test/OpenMP/nvptx_teams_reduction_codegen.cpp | 6 +- 12 files changed, 67 insertions(+), 54 deletions(-) diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 97b8f79a9f..21911c96f3 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -56,12 +56,12 @@ enum OpenMPRTLFunctionNVPTX { /// Call to int64_t __kmpc_shuffle_int64(int64_t element, /// int16_t lane_offset, int16_t warp_size); OMPRTL_NVPTX__kmpc_shuffle_int64, - /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 + /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t /// lane_offset, int16_t shortCircuit), /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); - OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2, /// Call to __kmpc_nvptx_teams_reduce_nowait_simple(ident_t *loc, kmp_int32 /// 
global_tid, kmp_critical_name *lck) OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_simple, @@ -91,10 +91,11 @@ enum OpenMPRTLFunctionNVPTX { OMPRTL_NVPTX__kmpc_parallel_level, /// Call to int8_t __kmpc_is_spmd_exec_mode(); OMPRTL_NVPTX__kmpc_is_spmd_exec_mode, - /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size, - /// int16_t is_shared, const void **res); + /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + /// const void *buf, size_t size, int16_t is_shared, const void **res); OMPRTL_NVPTX__kmpc_get_team_static_memory, - /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared); + /// Call to void __kmpc_restore_team_static_memory(int16_t + /// isSPMDExecutionMode, int16_t is_shared); OMPRTL_NVPTX__kmpc_restore_team_static_memory, /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid); OMPRTL__kmpc_barrier, @@ -1646,12 +1647,12 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); break; } - case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: { - // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid, - // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, - // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t - // lane_offset, int16_t Algorithm Version), - // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); + case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2: { + // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, + // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void* + // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t + // lane_id, int16_t lane_offset, int16_t Algorithm Version), void + // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, CGM.Int16Ty, CGM.Int16Ty}; auto *ShuffleReduceFnTy = @@ -1661,7 +1662,8 @@ 
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *InterWarpCopyFnTy = llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, /*isVarArg=*/false); - llvm::Type *TypeParams[] = {CGM.Int32Ty, + llvm::Type *TypeParams[] = {getIdentTyPointerTy(), + CGM.Int32Ty, CGM.Int32Ty, CGM.SizeTy, CGM.VoidPtrTy, @@ -1670,7 +1672,7 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { auto *FnTy = llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction( - FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); + FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2"); break; } case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { @@ -1779,19 +1781,21 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { break; } case OMPRTL_NVPTX__kmpc_get_team_static_memory: { - // Build void __kmpc_get_team_static_memory(const void *buf, size_t size, - // int16_t is_shared, const void **res); - llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty, - CGM.VoidPtrPtrTy}; + // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + // const void *buf, size_t size, int16_t is_shared, const void **res); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy, + CGM.Int16Ty, CGM.VoidPtrPtrTy}; auto *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory"); break; } case OMPRTL_NVPTX__kmpc_restore_team_static_memory: { - // Build void __kmpc_restore_team_static_memory(int16_t is_shared); + // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + // int16_t is_shared); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); RTLFn = CGM.CreateRuntimeFunction(FnTy, 
"__kmpc_restore_team_static_memory"); break; @@ -2211,8 +2215,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getSizeType(), Loc); llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld, - IsInSharedMemory, ResAddr}; + llvm::Value *GlobalRecordSizeArg[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_get_team_static_memory), GlobalRecordSizeArg); @@ -2400,10 +2407,15 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF, Address(GlobalizedRecords.back().UseSharedMemory, CGM.getContext().getTypeAlignInChars(Int16Ty)), /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); + llvm::Value *Args[] = { + llvm::ConstantInt::get( + CGM.Int16Ty, + getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0), + IsInSharedMemory}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( OMPRTL_NVPTX__kmpc_restore_team_static_memory), - IsInSharedMemory); + Args); } } else { CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( @@ -3608,7 +3620,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 3. Call the OpenMP runtime on the GPU to reduce within a team /// and store the result on the team master: /// -/// __kmpc_nvptx_parallel_reduce_nowait(..., +/// __kmpc_nvptx_parallel_reduce_nowait_v2(..., /// reduceData, shuffleReduceFn, interWarpCpyFn) /// /// where: @@ -3779,7 +3791,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// Intra-Team Reduction /// /// This function, as implemented in the runtime call -/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP +/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP /// threads in a team. 
It first reduces within a warp using the /// aforementioned algorithms. We then proceed to gather all such /// reduced values at the first warp. @@ -3802,7 +3814,7 @@ static llvm::Value *emitShuffleAndReduceFunction( /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., /// the k'th worker reduces every k'th element. /// -/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to +/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to /// reduce across workers and compute a globally reduced value. /// void CGOpenMPRuntimeNVPTX::emitReduction( @@ -3832,6 +3844,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction( // RedList, shuffle_reduce_func, interwarp_copy_func); // or // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>); + llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); llvm::Value *ThreadId = getThreadID(CGF, Loc); llvm::Value *Res; @@ -3886,19 +3899,19 @@ void CGOpenMPRuntimeNVPTX::emitReduction( llvm::Value *InterWarpCopyFn = emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc); - llvm::Value *Args[] = {ThreadId, + llvm::Value *Args[] = {RTLoc, + ThreadId, CGF.Builder.getInt32(RHSExprs.size()), ReductionArrayTySize, RL, ShuffleAndReduceFn, InterWarpCopyFn}; - Res = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), - Args); + Res = CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait_v2), + Args); } else { assert(TeamsReduction && "expected teams reduction."); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); std::string Name = getName({"reduction"}); llvm::Value *Lock = getCriticalRegionLock(Name); llvm::Value *Args[] = {RTLoc, ThreadId, Lock}; diff --git a/test/OpenMP/nvptx_data_sharing.cpp b/test/OpenMP/nvptx_data_sharing.cpp index df9c3ee83b..7b21d82794 100644 --- a/test/OpenMP/nvptx_data_sharing.cpp +++ b/test/OpenMP/nvptx_data_sharing.cpp @@ -46,7 +46,7 @@ void test_ds(){ // CK1: call void
@__kmpc_data_sharing_init_stack // CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], // CK1: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]], -// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0 // CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty* @@ -76,7 +76,7 @@ void test_ds(){ // CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CK1: call void @__kmpc_end_sharing_variables() // CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CK1: call void @__kmpc_restore_team_static_memory(i16 [[SHARED_MEM_FLAG]]) +// CK1: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]]) // CK1: call void @__kmpc_kernel_deinit(i16 1) /// ========= In the data sharing wrapper function ========= /// diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 4e763bd139..d9056eeff5 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -30,7 +30,7 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void 
@__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 84, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 84, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align @@ -46,7 +46,7 @@ int main(int argc, char **argv) { // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 1) // CHECK: define internal void [[PARALLEL]]( // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( diff --git a/test/OpenMP/nvptx_parallel_codegen.cpp b/test/OpenMP/nvptx_parallel_codegen.cpp index 21a616d091..04089ce3f5 100644 --- a/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/test/OpenMP/nvptx_parallel_codegen.cpp @@ -330,7 +330,7 @@ int bar(int n){ // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32* // CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], // CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), 
i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* @@ -339,7 +339,7 @@ int bar(int n){ // CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], // CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 [[IS_SHARED]]) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.*}}) // CHECK: [[CC:%.+]] = alloca i32, diff --git a/test/OpenMP/nvptx_parallel_for_codegen.cpp b/test/OpenMP/nvptx_parallel_for_codegen.cpp index 92783d6085..1446ba50ce 100644 --- a/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -47,7 +47,7 @@ int bar(int n){ // CHECK: call void @__kmpc_data_sharing_init_stack() // CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], // CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to 
i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CHECK: call void @__kmpc_kernel_prepare_parallel( @@ -56,7 +56,7 @@ int bar(int n){ // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_end_sharing_variables() // CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// CHECK: call void @__kmpc_restore_team_static_memory(i16 [[IS_SHARED]]) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) // CHECK: call void @__kmpc_kernel_deinit(i16 1) // CHECK: define internal void @__omp_outlined__( diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp index ff44c0e8fb..b05ee9dee6 100644 --- a/test/OpenMP/nvptx_target_codegen.cpp +++ b/test/OpenMP/nvptx_target_codegen.cpp @@ -37,7 +37,7 @@ struct TT{ // CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]], // CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]], // CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MAP_TY]], [[MAP_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} %{{.+}}, i16 %{{.+}}, i8** addrspacecast (i8* addrspace(3)* [[BUF_PTR:@.+]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MAP_TY]], [[MAP_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} %{{.+}}, i16 %{{.+}}, i8** addrspacecast (i8* addrspace(3)* [[BUF_PTR:@.+]] to i8**)) // CHECK: 
[[BUF:%.+]] = load i8*, i8* addrspace(3)* [[BUF_PTR]], // CHECK: [[BUF_OFFS:%.+]] = getelementptr inbounds i8, i8* [[BUF]], i{{[0-9]+}} 0 // CHECK: [[BUF:%.+]] = bitcast i8* [[BUF_OFFS]] to [[GLOB_TY]]* diff --git a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp index 34ad93b695..4f06f33e68 100644 --- a/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -72,7 +72,7 @@ int bar(int n){ // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[CMP]], label @@ -273,7 +273,7 @@ int bar(int n){ // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[CMP]], label // CHECK: 
[[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align @@ -560,7 +560,7 @@ int bar(int n){ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) // CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[CMP]], label diff --git a/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp index cf007427ec..3a0e513827 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -62,7 +62,7 @@ int bar(int n){ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub nuw i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) + // CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], // CHECK: [[RD:%.+]] = bitcast i8* [[PTR]] to [[GLOB_TY:%.+]]* // CHECK: [[I_ADDR:%.+]] = 
getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp index 2ecb9ca0e2..c4df348329 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -83,14 +83,14 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l32( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp 
index c4db1bc579..21fb46fc51 100644 --- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -71,14 +71,14 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0) -// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 4, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[BC:%.+]] = bitcast i8* [[TEAM_ALLOC]] to [[REC:%.+]]* // CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// CHECK: call void @__kmpc_restore_team_static_memory(i16 1) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: ret void diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp index b259a4b938..2d1ab9f063 100644 --- a/test/OpenMP/nvptx_teams_codegen.cpp +++ b/test/OpenMP/nvptx_teams_codegen.cpp @@ -46,7 +46,7 @@ int main (int argc, char **argv) { // CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* // CK1: [[IS_SHARED:%.+]] = 
load i16, i16* [[KERNEL_SHARED1]], // CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] @@ -67,7 +67,7 @@ int main (int argc, char **argv) { // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] // CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], // CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// CK1: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, 
i{{[0-9]+}}*** [[ARGCADDR]] @@ -137,7 +137,7 @@ int main (int argc, char **argv) { // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* // CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], // CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// CK2: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] @@ -162,7 +162,7 @@ int main (int argc, char **argv) { // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], // CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], // CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// CK2: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CK2: 
[[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] diff --git a/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 818f0739b2..b0246e2ee8 100644 --- a/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -137,7 +137,7 @@ int bar(int n){ // // CHECK: call void @__kmpc_spmd_kernel_init( // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() - // CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**)) + // CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY:%.+]], %{{.+}} addrspace(3)* [[KERNEL_RD:@.+]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} {{8|16}}, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR:@.+]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[GLOBAL_REC:%.+]] = bitcast i8* [[PTR]] to [[GLOB_REC_TY:%.+]]* // CHECK-DAG: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOB_REC_TY]], [[GLOB_REC_TY]]* [[GLOBAL_REC]], i32 0, i32 0 @@ -176,7 +176,7 @@ int bar(int n){ // CHECK: br label %[[EXIT]] // // CHECK: [[EXIT]] - // call void @__kmpc_restore_team_static_memory(i16 1) + // CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 1) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable{{.+}}, i16* dereferenceable{{.+}}) @@ -210,7 +210,7 @@ int bar(int n){ // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* // CHECK: store i8* 
[[B_CAST]], i8** [[PTR2]], align // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* - // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]]) + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* [[LOC]], i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[PAR_SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[PAR_WARP_COPY_FN:@.+]]) // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] // -- 2.40.0