From: Scott Linder Date: Fri, 3 Aug 2018 15:50:52 +0000 (+0000) Subject: [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=77acb45e8012b27a895254f5518c1528a5b4b0cf;p=clang [OpenCL] Always emit alloca in entry block for enqueue_kernel builtin Ensures the statically sized alloca is not converted to DYNAMIC_STACKALLOC later because it is not in the entry block. Differential Revision: https://reviews.llvm.org/D50104 git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@338899 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index c1959bdf17..97fda7779b 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -3338,23 +3338,29 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, // Create a temporary array to hold the sizes of local pointer arguments // for the block. \p First is the position of the first size argument. auto CreateArrayForSizeVar = [=](unsigned First) { - auto *AT = llvm::ArrayType::get(SizeTy, NumArgs - First); - auto *Arr = Builder.CreateAlloca(AT); - llvm::Value *Ptr; + llvm::APInt ArraySize(32, NumArgs - First); + QualType SizeArrayTy = getContext().getConstantArrayType( + getContext().getSizeType(), ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + auto Tmp = CreateMemTemp(SizeArrayTy, "block_sizes"); + llvm::Value *TmpPtr = Tmp.getPointer(); + llvm::Value *TmpSize = EmitLifetimeStart( + CGM.getDataLayout().getTypeAllocSize(Tmp.getElementType()), TmpPtr); + llvm::Value *ElemPtr; // Each of the following arguments specifies the size of the corresponding // argument passed to the enqueued block. auto *Zero = llvm::ConstantInt::get(IntTy, 0); for (unsigned I = First; I < NumArgs; ++I) { auto *Index = llvm::ConstantInt::get(IntTy, I - First); - auto *GEP = Builder.CreateGEP(Arr, {Zero, Index}); + auto *GEP = Builder.CreateGEP(TmpPtr, {Zero, Index}); if (I == First) - Ptr = GEP; + ElemPtr = GEP; auto *V = Builder.CreateZExtOrTrunc(EmitScalarExpr(E->getArg(I)), SizeTy); Builder.CreateAlignedStore( V, GEP, CGM.getDataLayout().getPrefTypeAlignment(SizeTy)); } - return Ptr; + return std::tie(ElemPtr, TmpSize, TmpPtr); }; // Could have events and/or varargs. @@ -3366,24 +3372,27 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, llvm::Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); - auto *PtrToSizeArray = CreateArrayForSizeVar(4); + llvm::Value *ElemPtr, *TmpSize, *TmpPtr; + std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(4); // Create a vector of the arguments, as well as a constant value to // express to the runtime the number of variadic arguments. std::vector Args = { Queue, Flags, Range, Kernel, Block, ConstantInt::get(IntTy, NumArgs - 4), - PtrToSizeArray}; + ElemPtr}; std::vector ArgTys = { - QueueTy, IntTy, RangeTy, - GenericVoidPtrTy, GenericVoidPtrTy, IntTy, - PtrToSizeArray->getType()}; + QueueTy, IntTy, RangeTy, GenericVoidPtrTy, + GenericVoidPtrTy, IntTy, ElemPtr->getType()}; llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); - return RValue::get( - Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), - llvm::ArrayRef(Args))); + auto Call = + RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), + llvm::ArrayRef(Args))); + if (TmpSize) + EmitLifetimeEnd(TmpSize, TmpPtr); + return Call; } // Any calls now have event arguments passed. if (NumArgs >= 7) { @@ -3430,15 +3439,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const FunctionDecl *FD, ArgTys.push_back(Int32Ty); Name = "__enqueue_kernel_events_varargs"; - auto *PtrToSizeArray = CreateArrayForSizeVar(7); - Args.push_back(PtrToSizeArray); - ArgTys.push_back(PtrToSizeArray->getType()); + llvm::Value *ElemPtr, *TmpSize, *TmpPtr; + std::tie(ElemPtr, TmpSize, TmpPtr) = CreateArrayForSizeVar(7); + Args.push_back(ElemPtr); + ArgTys.push_back(ElemPtr->getType()); llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); - return RValue::get( - Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), - llvm::ArrayRef(Args))); + auto Call = + RValue::get(Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name), + llvm::ArrayRef(Args))); + if (TmpSize) + EmitLifetimeEnd(TmpSize, TmpPtr); + return Call; } LLVM_FALLTHROUGH; } diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl index d74a1dfbd4..e064a28ba0 100644 --- a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -1,5 +1,6 @@ // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B32 // RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=COMMON --check-prefix=B64 +// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O1 -emit-llvm -o - -triple "spir64-unknown-unknown" | FileCheck %s --check-prefix=CHECK-LIFETIMES #pragma OPENCL EXTENSION cl_khr_subgroups : enable @@ -46,8 +47,31 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; - // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 + + // B32: %[[BLOCK_SIZES1:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES1:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES2:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES2:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES3:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES3:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES4:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES4:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES5:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES5:.*]] = alloca [1 x i64] + // B32: %[[BLOCK_SIZES6:.*]] = alloca [3 x i32] + // B64: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES6:.*]] = alloca [3 x i64] + // B32: %[[BLOCK_SIZES7:.*]] = alloca [1 x i32] + // B64: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + // CHECK-LIFETIMES: %[[BLOCK_SIZES7:.*]] = alloca [1 x i64] + + // Emits block literal on stack and block kernel [[INVLK1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* @@ -73,7 +97,6 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* [[BL_I8]]) - enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, ^(void) { a[i] = b[i]; @@ -82,39 +105,46 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // Emits global block literal [[BLG1]] and block kernel [[INVGK1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 256, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 256, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES1]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES1]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES1]], i32 0, i32 0 + // B32: store i32 256, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES1]], i32 0, i32 0 + // B64: store i64 256, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs( // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK1:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 256); + char c; // Emits global block literal [[BLG2]] and block kernel [[INVGK2]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES2]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES2]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES2]], i32 0, i32 0 + // B32: store i32 %{{.*}}, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES2]], i32 0, i32 0 + // B64: store i64 %{{.*}}, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs( // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; @@ -127,18 +157,21 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** [[AD]] to %opencl.clk_event_t{{.*}}* addrspace(4)* // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)* - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 256, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 256, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES3]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES3]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_events_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES3]], i32 0, i32 0 + // B32: store i32 256, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES3]], i32 0, i32 0 + // B64: store i64 256, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_events_varargs // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK3:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; @@ -151,18 +184,21 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** [[AD]] to %opencl.clk_event_t{{.*}}* addrspace(4)* // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)* - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES4]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES4]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_events_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES4]], i32 0, i32 0 + // B32: store i32 %{{.*}}, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES4]], i32 0, i32 0 + // B64: store i64 %{{.*}}, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_events_varargs // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK4:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; @@ -173,18 +209,21 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // Emits global block literal [[BLG5]] and block kernel [[INVGK5]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES5]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES5]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES5]], i32 0, i32 0 + // B32: store i32 %{{.*}}, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES5]], i32 0, i32 0 + // B64: store i64 %{{.*}}, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK5:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG5]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; @@ -194,26 +233,29 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // Emits global block literal [[BLG6]] and block kernel [[INVGK6]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [3 x i32] - // B32: %[[TMP1:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 1, i32* %[[TMP1]], align 4 - // B32: %[[TMP2:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 1 - // B32: store i32 2, i32* %[[TMP2]], align 4 - // B32: %[[TMP3:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 2 - // B32: store i32 4, i32* %[[TMP3]], align 4 - // B64: %[[TMP:.*]] = alloca [3 x i64] - // B64: %[[TMP1:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 1, i64* %[[TMP1]], align 8 - // B64: %[[TMP2:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 1 - // B64: store i64 2, i64* %[[TMP2]], align 8 - // B64: %[[TMP3:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 2 - // B64: store i64 4, i64* %[[TMP3]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [3 x i64]* %[[BLOCK_SIZES6]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [3 x i64], [3 x i64]* %[[BLOCK_SIZES6]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [3 x i32], [3 x i32]* %[[BLOCK_SIZES6]], i32 0, i32 0 + // B32: store i32 1, i32* %[[TMP]], align 4 + // B32: %[[BLOCK_SIZES62:.*]] = getelementptr [3 x i32], [3 x i32]* %[[BLOCK_SIZES6]], i32 0, i32 1 + // B32: store i32 2, i32* %[[BLOCK_SIZES62]], align 4 + // B32: %[[BLOCK_SIZES63:.*]] = getelementptr [3 x i32], [3 x i32]* %[[BLOCK_SIZES6]], i32 0, i32 2 + // B32: store i32 4, i32* %[[BLOCK_SIZES63]], align 4 + // B64: %[[TMP:.*]] = getelementptr [3 x i64], [3 x i64]* %[[BLOCK_SIZES6]], i32 0, i32 0 + // B64: store i64 1, i64* %[[TMP]], align 8 + // B64: %[[BLOCK_SIZES62:.*]] = getelementptr [3 x i64], [3 x i64]* %[[BLOCK_SIZES6]], i32 0, i32 1 + // B64: store i64 2, i64* %[[BLOCK_SIZES62]], align 8 + // B64: %[[BLOCK_SIZES63:.*]] = getelementptr [3 x i64], [3 x i64]* %[[BLOCK_SIZES6]], i32 0, i32 2 + // B64: store i64 4, i64* %[[BLOCK_SIZES63]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK6:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG6]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p1, local void *p2, local void *p3) { return; @@ -223,18 +265,21 @@ kernel void device_side_enqueue(global int *a, global int *b, int i) { // Emits global block literal [[BLG7]] and block kernel [[INVGK7]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags - // B32: %[[TMP:.*]] = alloca [1 x i32] - // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 - // B32: store i32 0, i32* %[[TMP1]], align 4 - // B64: %[[TMP:.*]] = alloca [1 x i64] - // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 - // B64: store i64 4294967296, i64* %[[TMP1]], align 8 + // CHECK-LIFETIMES: [[LIFETIME_PTR:%[0-9]+]] = bitcast [1 x i64]* %[[BLOCK_SIZES7]] to i8* + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // CHECK-LIFETIMES-NEXT: getelementptr inbounds [1 x i64], [1 x i64]* %[[BLOCK_SIZES7]], i64 0, i64 0 + // CHECK-LIFETIMES-LABEL: call i32 @__enqueue_kernel_varargs( + // CHECK-LIFETIMES-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull [[LIFETIME_PTR]]) + // B32: %[[TMP:.*]] = getelementptr [1 x i32], [1 x i32]* %[[BLOCK_SIZES7]], i32 0, i32 0 + // B32: store i32 0, i32* %[[TMP]], align 4 + // B64: %[[TMP:.*]] = getelementptr [1 x i64], [1 x i64]* %[[BLOCK_SIZES7]], i32 0, i32 0 + // B64: store i64 4294967296, i64* %[[TMP]], align 8 // COMMON-LABEL: call i32 @__enqueue_kernel_varargs // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK7:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32 } addrspace(1)* [[BLG7]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, - // B32-SAME: i32* %[[TMP1]]) - // B64-SAME: i64* %[[TMP1]]) + // B32-SAME: i32* %[[TMP]]) + // B64-SAME: i64* %[[TMP]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; diff --git a/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl b/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl new file mode 100644 index 0000000000..00c4755c1c --- /dev/null +++ b/test/CodeGenOpenCL/enqueue-kernel-non-entry-block.cl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=COMMON,AMDGPU +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR32 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -emit-llvm -o - -triple "spir64-unknown-unknown" < %s | FileCheck %s --check-prefixes=COMMON,SPIR64 +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -debug-info-kind=limited -emit-llvm -o - -triple amdgcn < %s | FileCheck %s --check-prefixes=CHECK-DEBUG + +// Check that the enqueue_kernel array temporary is in the entry block to avoid +// a dynamic alloca + +typedef struct {int a;} ndrange_t; + +kernel void test(int i) { +// COMMON-LABEL: define {{.*}} void @test +// COMMON-LABEL: entry: +// AMDGPU: %block_sizes = alloca [1 x i64] +// SPIR32: %block_sizes = alloca [1 x i32] +// SPIR64: %block_sizes = alloca [1 x i64] +// COMMON-LABEL: if.then: +// COMMON-NOT: alloca +// CHECK-DEBUG: getelementptr {{.*}} %block_sizes, {{.*}} !dbg !34 +// COMMON-LABEL: if.end + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + if (i) + enqueue_kernel(default_queue, flags, ndrange, ^(local void *a) { }, 32); +} + +// Check that the temporary is scoped to the `if` + +// CHECK-DEBUG: !32 = distinct !DILexicalBlock(scope: !7, file: !1, line: 24) +// CHECK-DEBUG: !34 = !DILocation(line: 25, scope: !32)