TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
break;
case ADDRESS_SPACE_PARAM:
+ if (Src.getOpcode() == NVPTXISD::MoveParam) {
+ // TL;DR: addrspacecast MoveParam to param space is a no-op.
+ //
+ // Longer version is that this particular change is both an
+ // optimization and a work-around for a problem in ptxas for
+ // sm_50+.
+ //
+ // Background:
+ // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal
+ // parameter has its address taken within the called function
+ // [..] the parameter will be copied to the stack if
+ // necessary, and so the address will be in the .local state
+ // space and is accessed via ld.local and st.local
+ // instructions."
+ //
+ // Bug part: 'copied to the stack if necessary' part is
+ // currently broken for byval arguments with alignment < 4 for
+ // sm_50+ in all public versions of CUDA. Taking the address of
+ // such an argument results in SASS code that attempts to store
+ // the value using a 32-bit write to an unaligned address.
+ //
+ // Optimization: On top of the overhead of spill-to-stack, we
+ // also convert that address from local to generic space,
+ // which may result in further overhead. All of it is in most
+ // cases unnecessary, as we only need the value of the
+ // variable, and it can be accessed directly by using the
+ // argument symbol. We'll use the address of the local copy if
+ // it's needed (see step (a) below).
+ //
+ // In order for this bit to do its job we need to:
+ //
+ // a) Let LLVM do the spilling and make sure the IR does not
+ // access byval arguments directly. Instead, the argument
+ // pointer (which is in the default address space) is
+ // addrspacecast'ed to param space, and the data it points to
+ // is copied to allocated local space (i.e. we let compiler
+ // spill it, which gives us an opportunity to optimize the
+ // spill away later if nobody needs its address). Then all
+ // uses of arg pointer are replaced with a pointer to local
+ // copy of the arg. All this is done in NVPTXLowerKernelArgs.
+ //
+ // b) LowerFormalArguments() lowers the argument to
+ // NVPTXISD::MoveParam without any space conversion.
+ //
+ // c) And the final step is done by the code below.
+ // It replaces the addrspacecast (MoveParam) from step (a)
+ // with the arg symbol itself. This can then be used for
+ // [symbol + offset] addressing.
+
+ ReplaceNode(N, Src.getOperand(0).getNode());
+ return;
+ }
Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
: NVPTX::nvvm_ptr_gen_to_param;
break;
SDValue Root = DAG.getRoot();
std::vector<SDValue> OutChains;
- bool isKernel = llvm::isKernelFunction(*F);
bool isABI = (STI.getSmVersion() >= 20);
assert(isABI && "Non-ABI compilation is not supported");
if (!isABI)
theArgs[i],
(theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
: nullptr))) {
- assert(isKernel && "Only kernels can have image/sampler params");
+ assert(llvm::isKernelFunction(*F) &&
+ "Only kernels can have image/sampler params");
InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
continue;
}
// machine instruction fails because TargetExternalSymbol
// (not lowered) is target dependent, and CopyToReg assumes
// the source is lowered.
+ //
+ // Byval arguments for regular function are lowered the same way
+ // as for kernels in order to generate better code and work around
+ // a known issue in ptxas. See comments in
+ // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details.
EVT ObjectVT = getValueType(DL, Ty);
assert(ObjectVT == Ins[InsIdx].VT &&
"Ins type did not match function type");
SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
if (p.getNode())
p.getNode()->setIROrder(idx + 1);
- if (isKernel)
- InVals.push_back(p);
- else {
- SDValue p2 = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
- DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
- InVals.push_back(p2);
- }
+ InVals.push_back(p);
}
// Clang will check explicit VarArg and issue error if any. However, Clang
class NVPTXLowerKernelArgs : public FunctionPass {
bool runOnFunction(Function &F) override;
+ // Kernels and regular device functions treat byval arguments
+ // differently.
+ bool runOnKernelFunction(Function &F);
+ bool runOnDeviceFunction(Function &F);
+
// handle byval parameters
void handleByValParam(Argument *Arg);
// Knowing Ptr must point to the global address space, this function
// =============================================================================
// Main function for this pass.
// =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
- // Skip non-kernels. See the comments at the top of this file.
- if (!isKernelFunction(F))
- return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
// Mark pointers in byval structs as global.
for (auto &B : F) {
return true;
}
+// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the
+// details on why we need to spill byval arguments into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+ for (Argument &Arg : F.args())
+ if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+ handleByValParam(&Arg);
+ return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+ return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
FunctionPass *
llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
return new NVPTXLowerKernelArgs(TM);
%b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%0 = load i32, i32* %b, align 4
; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
store i32 %0, i32* %output, align 4
; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
ret void
%struct.S = type { i32*, i32* }
-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
+ %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+ %b = load i32*, i32** %b_ptr, align 4
+ %v = load i32, i32* %b, align 4
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
+ store i32 %v, i32* %output, align 4
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+ ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
%b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
%b = load i32*, i32** %b_ptr, align 4
%v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.u32 [%[[optr]]], %[[val]]
ret void
}
!nvvm.annotations = !{!0, !1, !2}
!0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
!1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}