From: Artem Belevich Date: Wed, 29 Jun 2016 20:51:15 +0000 (+0000) Subject: Revert r273313 "[NVPTX] Improve lowering of byval args of device functions." X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a872e23126619aff3a4ad128c0794dd150b7a004;p=llvm Revert r273313 "[NVPTX] Improve lowering of byval args of device functions." The change causes llvm crash in some unoptimized builds. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274163 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c3b6f7b034b..61c6758ef11 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -662,58 +662,6 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes; break; case ADDRESS_SPACE_PARAM: - if (Src.getOpcode() == NVPTXISD::MoveParam) { - // TL;DR: addrspacecast MoveParam to param space is a no-op. - // - // Longer version is that this particular change is both an - // optimization and a work-around for a problem in ptxas for - // sm_50+. - // - // Background: - // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal - // parameter has its address taken within the called function - // [..] the parameter will be copied to the stack if - // necessary, and so the address will be in the .local state - // space and is accessed via ld.local and st.local - // instructions." - // - // Bug part: 'copied to the stack if necessary' part is - // currently broken for byval arguments with alignment < 4 for - // sm_50+ in all public versions of CUDA. Taking the address of - // such an argument results in SASS code that attempts to store - // the value using a 32-bit write to an unaligned address. - // - // Optimization: On top of the overhead of spill-to-stack, we - // also convert that address from local to generic space, - // which may result in further overhead. All of it is in most - // cases unnecessary, as we only need the value of the - // variable, and it can be accessed directly by using the - // argument symbol. We'll use the address of the local copy if - // it's needed (see step (a) below). - // - // In order for this bit to do its job we need to: - // - // a) Let LLVM do the spilling and make sure the IR does not - // access byval arguments directly. Instead, the argument - // pointer (which is in the default address space) is - // addrspacecast'ed to param space, and the data it points to - // is copied to allocated local space (i.e. we let compiler - // spill it, which gives us an opportunity to optimize the - // spill away later if nobody needs its address). Then all - // uses of arg pointer are replaced with a pointer to local - // copy of the arg. All this is done in NVPTXLowerKernelArgs. - // - // b) LowerFormalArguments() lowers the argument to - // NVPTXISD::MoveParam without any space conversion. - // - // c) And the final step is done by the code below. - // It replaces the addrspacecast (MoveParam) from step (a) - // with the arg symbol itself. This can then be used for - // [symbol + offset] addressing. - - ReplaceNode(N, Src.getOperand(0).getNode()); - return; - } Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 : NVPTX::nvvm_ptr_gen_to_param; break; diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7d457aff46c..e81f824b2f3 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2078,6 +2078,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue Root = DAG.getRoot(); std::vector OutChains; + bool isKernel = llvm::isKernelFunction(*F); bool isABI = (STI.getSmVersion() >= 20); assert(isABI && "Non-ABI compilation is not supported"); if (!isABI) @@ -2111,8 +2112,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( theArgs[i], (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent() : nullptr))) { - assert(llvm::isKernelFunction(*F) && - "Only kernels can have image/sampler params"); + assert(isKernel && "Only kernels can have image/sampler params"); InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32)); continue; } @@ -2334,11 +2334,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // machine instruction fails because TargetExternalSymbol // (not lowered) is target dependent, and CopyToReg assumes // the source is lowered. - // - // Byval arguments for regular function are lowered the same way - // as for kernels in order to generate better code and work around - // a known issue in ptxas. See comments in - // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details. EVT ObjectVT = getValueType(DL, Ty); assert(ObjectVT == Ins[InsIdx].VT && "Ins type did not match function type"); @@ -2346,7 +2341,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg); if (p.getNode()) p.getNode()->setIROrder(idx + 1); - InVals.push_back(p); + if (isKernel) + InVals.push_back(p); + else { + SDValue p2 = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT, + DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p); + InVals.push_back(p2); + } } // Clang will check explicit VarArg and issue error if any. However, Clang diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp index a4769b319a0..d162a283f74 100644 --- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -101,11 +101,6 @@ namespace { class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; - // Kernels and regular device functions treat byval arguments - // differently. - bool runOnKernelFunction(Function &F); - bool runOnDeviceFunction(Function &F); - // handle byval parameters void handleByValParam(Argument *Arg); // Knowing Ptr must point to the global address space, this function @@ -197,7 +192,11 @@ void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) { // ============================================================================= // Main function for this pass. // ============================================================================= -bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) { +bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { + // Skip non-kernels. See the comments at the top of this file. + if (!isKernelFunction(F)) + return false; + if (TM && TM->getDrvInterface() == NVPTX::CUDA) { // Mark pointers in byval structs as global. for (auto &B : F) { @@ -229,19 +228,6 @@ bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) { return true; } -// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the -// details on why we need to spill byval arguments into local memory. -bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) { - for (Argument &Arg : F.args()) - if (Arg.getType()->isPointerTy() && Arg.hasByValAttr()) - handleByValParam(&Arg); - return true; -} - -bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { - return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F); -} - FunctionPass * llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) { return new NVPTXLowerKernelArgs(TM); diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll index acd3cee9848..2eae41f73a0 100644 --- a/test/CodeGen/NVPTX/bug21465.ll +++ b/test/CodeGen/NVPTX/bug21465.ll @@ -15,7 +15,7 @@ entry: %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %0 = load i32, i32* %b, align 4 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] -; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] +; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4] store i32 %0, i32* %output, align 4 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] ret void diff --git a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll index bdb1d3c546c..2fffa3eeac1 100644 --- a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll +++ b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -28,38 +28,20 @@ define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) { %struct.S = type { i32*, i32* } -define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) { -; CHECK-LABEL: .visible .entry ptr_in_byval_kernel( -; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1] -; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]]; -; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8] -; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]]; - %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 - %b = load i32*, i32** %b_ptr, align 4 - %v = load i32, i32* %b, align 4 -; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]] - store i32 %v, i32* %output, align 4 -; CHECK: st.global.u32 [%[[optr_g]]], %[[val]] - ret void -} - -; Regular functions lower byval arguments differently. We need to make -; sure that we're loading byval argument data using [symbol+offset]. -; There's also no assumption that all pointers within are in global space. -define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) { -; CHECK-LABEL: .visible .func ptr_in_byval_func( -; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1] -; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8] +define void @ptr_in_byval(%struct.S* byval %input, i32* %output) { +; CHECK-LABEL: .visible .entry ptr_in_byval( +; CHECK: cvta.to.global.u64 +; CHECK: cvta.to.global.u64 %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1 %b = load i32*, i32** %b_ptr, align 4 %v = load i32, i32* %b, align 4 -; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]] +; CHECK: ld.global.u32 store i32 %v, i32* %output, align 4 -; CHECK: st.u32 [%[[optr]]], %[[val]] +; CHECK: st.global.u32 ret void } !nvvm.annotations = !{!0, !1, !2} !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1} !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1} -!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1} +!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}