From: Artem Belevich
Date: Tue, 21 Jun 2016 20:30:26 +0000 (+0000)
Subject: [NVPTX] Improve lowering of byval args of device functions.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a710d0d215c6a5a6752b0573a1901be38a6a1a63;p=llvm

[NVPTX] Improve lowering of byval args of device functions.

Avoid unnecessary spills of such arguments to local space at the SASS
level, as well as the pointer space conversion that goes with them.
Instead, make a local copy with appropriate addrspacecasts and let LLVM
optimize them away when possible.

This allows loading the value of the argument using [symbol+offset]
instead of converting the argument to a generic space pointer and using
it for indexing (which also implicitly converts a param space pointer
into a local space one at the SASS level and triggers copying of the
argument into local space in the process).

This reduces call overhead, uses fewer registers and reduces overall
SASS size by 2-4%.

Differential Revision: http://reviews.llvm.org/D21421

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273313 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 61c6758ef11..c3b6f7b034b 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -662,6 +662,58 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
         TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
     break;
   case ADDRESS_SPACE_PARAM:
+    if (Src.getOpcode() == NVPTXISD::MoveParam) {
+      // TL;DR: addrspacecast MoveParam to param space is a no-op.
+      //
+      // The longer version is that this particular change is both an
+      // optimization and a work-around for a problem in ptxas for
+      // sm_50+.
+      //
+      // Background:
+      // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal
+      // parameter has its address taken within the called function
+      // [..] the parameter will be copied to the stack if
+      // necessary, and so the address will be in the .local state
+      // space and is accessed via ld.local and st.local
+      // instructions."
+      //
+      // Bug part: the 'copied to the stack if necessary' part is
+      // currently broken for byval arguments with alignment < 4 for
+      // sm_50+ in all public versions of CUDA. Taking the address of
+      // such an argument results in SASS code that attempts to store
+      // the value using a 32-bit write to an unaligned address.
+      //
+      // Optimization: On top of the overhead of spill-to-stack, we
+      // also convert that address from local to generic space,
+      // which may result in further overhead. All of it is in most
+      // cases unnecessary, as we only need the value of the
+      // variable, and it can be accessed directly by using the
+      // argument symbol. We'll use the address of the local copy if
+      // it's needed (see step (a) below).
+      //
+      // In order for this bit to do its job we need to:
+      //
+      // a) Let LLVM do the spilling and make sure the IR does not
+      // access byval arguments directly. Instead, the argument
+      // pointer (which is in the default address space) is
+      // addrspacecast'ed to param space, and the data it points to
+      // is copied to allocated local space (i.e. we let the compiler
+      // spill it, which gives us an opportunity to optimize the
+      // spill away later if nobody needs its address). Then all
+      // uses of the arg pointer are replaced with a pointer to the
+      // local copy of the arg. All this is done in NVPTXLowerKernelArgs.
+      //
+      // b) LowerFormalArguments() lowers the argument to
+      // NVPTXISD::MoveParam without any space conversion.
+      //
+      // c) And the final step is done by the code below.
+      // It replaces the addrspacecast (MoveParam) from step (a)
+      // with the arg symbol itself. This can then be used for
+      // [symbol + offset] addressing.
+
+      ReplaceNode(N, Src.getOperand(0).getNode());
+      return;
+    }
     Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                        : NVPTX::nvvm_ptr_gen_to_param;
     break;
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 47a66358463..cdac1ac4695 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2078,7 +2078,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
 
-  bool isKernel = llvm::isKernelFunction(*F);
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
@@ -2112,7 +2111,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
             theArgs[i],
             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                      : nullptr))) {
-      assert(isKernel && "Only kernels can have image/sampler params");
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
       InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
       continue;
     }
@@ -2334,6 +2334,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     // machine instruction fails because TargetExternalSymbol
     // (not lowered) is target dependent, and CopyToReg assumes
     // the source is lowered.
+    //
+    // Byval arguments for regular functions are lowered the same way
+    // as for kernels in order to generate better code and work around
+    // a known issue in ptxas. See comments in
+    // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details.
     EVT ObjectVT = getValueType(DL, Ty);
     assert(ObjectVT == Ins[InsIdx].VT &&
            "Ins type did not match function type");
@@ -2341,14 +2346,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
     if (p.getNode())
       p.getNode()->setIROrder(idx + 1);
-    if (isKernel)
-      InVals.push_back(p);
-    else {
-      SDValue p2 = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
-      InVals.push_back(p2);
-    }
+    InVals.push_back(p);
   }
 
   // Clang will check explicit VarArg and issue error if any. However, Clang
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
index d162a283f74..a4769b319a0 100644
--- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -101,6 +101,11 @@ namespace {
 class NVPTXLowerKernelArgs : public FunctionPass {
   bool runOnFunction(Function &F) override;
 
+  // Kernels and regular device functions treat byval arguments
+  // differently.
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
   // handle byval parameters
   void handleByValParam(Argument *Arg);
   // Knowing Ptr must point to the global address space, this function
@@ -192,11 +197,7 @@ void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
   if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
     // Mark pointers in byval structs as global.
     for (auto &B : F) {
@@ -228,6 +229,19 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
   return true;
 }
 
+// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the
+// details on why we need to spill byval arguments into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
 FunctionPass *
 llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
   return new NVPTXLowerKernelArgs(TM);
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll
index 2eae41f73a0..acd3cee9848 100644
--- a/test/CodeGen/NVPTX/bug21465.ll
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -15,7 +15,7 @@ entry:
   %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %0 = load i32, i32* %b, align 4
 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
   store i32 %0, i32* %output, align 4
 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
   ret void
diff --git a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
index 2fffa3eeac1..bdb1d3c546c 100644
--- a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -28,20 +28,38 @@ define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) {
 
 %struct.S = type { i32*, i32* }
 
-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
+  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %b = load i32*, i32** %b_ptr, align 4
+  %v = load i32, i32* %b, align 4
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
+  store i32 %v, i32* %output, align 4
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+  ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
   %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
   %b = load i32*, i32** %b_ptr, align 4
   %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
   store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.u32 [%[[optr]]], %[[val]]
   ret void
 }
 
 !nvvm.annotations = !{!0, !1, !2}
 !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
 !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
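
For readers following along, here is a minimal sketch (not part of the commit) of what step (a) looks like at the IR level. The function and value names are hypothetical, and the rewrite shown is the conceptual effect of handleByValParam() as described above. Given a device function that takes %struct.S by value:

  %struct.S = type { i32*, i32* }

  define void @device_func(%struct.S* byval %input, i32* %output) {
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
    %b = load i32*, i32** %b_ptr
    %v = load i32, i32* %b
    store i32 %v, i32* %output
    ret void
  }

NVPTXLowerKernelArgs rewrites it along these lines:

  define void @device_func(%struct.S* byval %input, i32* %output) {
    ; local copy of the argument; LLVM can optimize the copy away
    ; later if nobody actually needs its address
    %input1 = alloca %struct.S
    ; view the incoming argument through the param address space (101)
    %input2 = addrspacecast %struct.S* %input to %struct.S addrspace(101)*
    %input3 = load %struct.S, %struct.S addrspace(101)* %input2
    store %struct.S %input3, %struct.S* %input1
    ; all original uses of %input are replaced with the local copy
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input1, i64 0, i32 1
    %b = load i32*, i32** %b_ptr
    %v = load i32, i32* %b
    store i32 %v, i32* %output
    ret void
  }

During instruction selection, %input is lowered to NVPTXISD::MoveParam (step (b)), and SelectAddrSpaceCast() folds the addrspacecast of that MoveParam into the argument symbol itself (step (c)), which is what enables the [ptr_in_byval_func_param_0+8] style of addressing checked in the tests above.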