From: Alexey Bataev <a.bataev@hotmail.com>
Date: Tue, 20 Mar 2018 15:41:05 +0000 (+0000)
Subject: [OPENMP, NVPTX] Codegen for target distribute parallel combined
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2c97b35e76515feb5629bbf6ed16ffd5da257c50;p=clang

[OPENMP, NVPTX] Codegen for target distribute parallel combined
constructs in generic mode.

Fixed codegen for distribute parallel combined constructs. We have to
pass and read the shared lower and upper bound from the distribute
region in the inner parallel region. Patch is for generic mode.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@327990 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index ccffa9cc4a..8ab890cb4c 100644
--- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -1444,7 +1444,11 @@ void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
       for (llvm::Value *V : CapturedVars) {
         Address Dst = Bld.CreateConstInBoundsGEP(
             SharedArgListAddress, Idx, CGF.getPointerSize());
-        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
+        llvm::Value * PtrV;
+        if (V->getType()->isIntegerTy())
+          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
+        else
+          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                               Ctx.getPointerType(Ctx.VoidPtrTy));
         ++Idx;
@@ -2963,22 +2967,56 @@ llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
 
   // Retrieve the shared variables from the list of references returned
   // by the runtime. Pass the variables to the outlined function.
+  Address SharedArgListAddress = Address::invalid();
+  if (CS.capture_size() > 0 ||
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    SharedArgListAddress = CGF.EmitLoadOfPointer(
+        GlobalArgs, CGF.getContext()
+                        .getPointerType(CGF.getContext().getPointerType(
+                            CGF.getContext().VoidPtrTy))
+                        .castAs<PointerType>());
+  }
+  unsigned Idx = 0;
+  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
+    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                             CGF.getPointerSize());
+    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *LB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
+    Args.emplace_back(LB);
+    ++Idx;
+    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
+                                     CGF.getPointerSize());
+    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        Src, CGF.SizeTy->getPointerTo());
+    llvm::Value *UB = CGF.EmitLoadOfScalar(
+        TypedAddress,
+        /*Volatile=*/false,
+        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
+        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
+    Args.emplace_back(UB);
+    ++Idx;
+  }
   if (CS.capture_size() > 0) {
     ASTContext &CGFContext = CGF.getContext();
-    Address SharedArgListAddress = CGF.EmitLoadOfPointer(GlobalArgs,
-        CGFContext
-            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
-            .castAs<PointerType>());
     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
       QualType ElemTy = CurField->getType();
-      Address Src = Bld.CreateConstInBoundsGEP(
-          SharedArgListAddress, I, CGF.getPointerSize());
-      Address TypedAddress = Bld.CreateBitCast(
+      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
+                                               CGF.getPointerSize());
+      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
           Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                               /*Volatile=*/false,
                                               CGFContext.getPointerType(ElemTy),
                                               CI->getLocation());
+      if (CI->capturesVariableByCopy()) {
+        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
+                              CI->getLocation());
+      }
       Args.emplace_back(Arg);
     }
   }
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
index fc3f25355e..73647c4112 100644
--- a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
+++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp
@@ -22,7 +22,7 @@ tx ftemplate(int n) {
   tx a[N];
   short aa[N];
   tx b[10];
-  tx c[M][M];  
+  tx c[M][M];
   tx f = n;
   tx l;
   int k;
@@ -47,7 +47,7 @@ tx ftemplate(int n) {
   for(int i = 0; i < M; i++) {
     for(int j = 0; j < M; j++) {
       k = M;
-      c[i][j] = i+j*f+k;      
+      c[i][j] = i + j * f + k;
     }
   }
 
diff --git a/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
new file mode 100644
index 0000000000..af72f3be8f
--- /dev/null
+++ b/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp
@@ -0,0 +1,98 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+int a;
+
+int foo(int *a);
+
+int main(int argc, char **argv) {
+#pragma omp target teams distribute parallel for map(tofrom:a) if(parallel:argc)
+  for (int i= 0; i < argc; ++i)
+    a = foo(&i) + foo(&a) + foo(&argc);
+  return 0;
+}
+
+// CHECK: define internal void @__omp_offloading_{{.*}}_main_l[[@LINE-6]]_worker()
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void [[PARALLEL:@.+]]_wrapper(i16 0, i32 [[TID]])
+
+// CHECK: define void @__omp_offloading_{{.*}}_main_l[[@LINE-10]](i{{64|32}} %{{[^,].*}}, i32* dereferenceable{{[^,]*}}, i{{64|32}} %{{[^,)]*}})
+// CHECK: [[TID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* @
+// CHECK: call void @__kmpc_kernel_init(
+// CHECK: call void @__kmpc_data_sharing_init_stack()
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[BUF_PTR_PTR:%[^,]+]], i{{64|32}} 4)
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[LB:%.+]] = inttoptr i{{64|32}} [[LB_:%.*]] to i8*
+// CHECK: store i8* [[LB]], i8** [[BUF_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB:%.+]] = inttoptr i{{64|32}} [[UB_:%.*]] to i8*
+// CHECK: store i8* [[UB]], i8** [[BUF_PTR1]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC:%.+]] = inttoptr i{{64|32}} [[ARGC_:%.*]] to i8*
+// CHECK: store i8* [[ARGC]], i8** [[BUF_PTR2]],
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_PTR:%.+]] = bitcast i32* [[A_ADDR:%.*]] to i8*
+// CHECK: store i8* [[A_PTR]], i8** [[BUF_PTR3]],
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: br label
+
+// CHECK: call void @__kmpc_serialized_parallel(%ident_t* @
+// CHECK: [[GTID_ADDR:%.*]] = load i32*, i32** %
+// CHECK: call void [[PARALLEL]](i32* [[GTID_ADDR]], i32* %{{.+}}, i{{64|32}} [[LB_]], i{{64|32}} [[UB_]], i{{64|32}} [[ARGC_]], i32* [[A_ADDR]])
+// CHECK: call void @__kmpc_end_serialized_parallel(%ident_t* @
+// CHECK: br label %
+
+
+// CHECK: call void @__kmpc_for_static_fini(%ident_t* @
+
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @llvm.nvvm.barrier0()
+
+// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i{{64|32}} %{{.+}}, i32* dereferenceable{{.*}})
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 8, i16 0)
+// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_TY:%.+]]*
+// CHECK: [[I:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[ARGC_VAL:%.+]] = load i32, i32* %
+// CHECK: [[ARGC:%.+]] = getelementptr inbounds [[GLOBAL_TY]], [[GLOBAL_TY]]* [[GLOBALS]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32 [[ARGC_VAL]], i32* [[ARGC]],
+
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: call i32 [[FOO:@.+foo.+]](i32* [[I]])
+// CHECK: call i32 [[FOO]](i32* %{{.+}})
+// CHECK: call i32 [[FOO]](i32* [[ARGC]])
+// CHECK: call void @__kmpc_for_static_fini(
+
+// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[RES]])
+
+// define internal void [[PARALLEL]]_wrapper(i16 zeroext, i32)
+// CHECK: call void @__kmpc_get_shared_variables(i8*** [[BUF_PTR_PTR:%.+]])
+// CHECK: [[BUF_PTR:%.+]] = load i8**, i8*** [[BUF_PTR_PTR]],
+// CHECK: [[BUF_PTR0:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 0
+// CHECK: [[LB_PTR:%.+]] = bitcast i8** [[BUF_PTR0]] to i{{64|32}}*
+// CHECK: [[LB:%.+]] = load i{{64|32}}, i{{64|32}}* [[LB_PTR]],
+// CHECK: [[BUF_PTR1:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 1
+// CHECK: [[UB_PTR:%.+]] = bitcast i8** [[BUF_PTR1]] to i{{64|32}}*
+// CHECK: [[UB:%.+]] = load i{{64|32}}, i{{64|32}}* [[UB_PTR]],
+// CHECK: [[BUF_PTR2:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 2
+// CHECK: [[ARGC_ADDR:%.+]] = bitcast i8** [[BUF_PTR2]] to i32*
+// CHECK: [[ARGC:%.+]] = load i32, i32* [[ARGC_ADDR]],
+// CHECK-64: [[ARGC_CAST:%.+]] = zext i32 [[ARGC]] to i64
+// CHECK: [[BUF_PTR3:%.+]] = getelementptr inbounds i8*, i8** [[BUF_PTR]], i{{[0-9]+}} 3
+// CHECK: [[A_ADDR_REF:%.+]] = bitcast i8** [[BUF_PTR3]] to i32**
+// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_ADDR_REF]],
+// CHECK-64: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i64 [[LB]], i64 [[UB]], i64 [[ARGC_CAST]], i32* [[A_ADDR]])
+// CHECK-32: call void [[PARALLEL]](i32* %{{.+}}, i32* %{{.+}}, i32 [[LB]], i32 [[UB]], i32 [[ARGC]], i32* [[A_ADDR]])
+// CHECK: ret void
+
+#endif