[NVPTX] Improve lowering of byval args of device functions.

author Artem Belevich <tra@google.com>

Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)

committer Artem Belevich <tra@google.com>

Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)
author Artem Belevich <tra@google.com>
Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)
committer Artem Belevich <tra@google.com>
Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

index 61c6758ef118babe45c5d49ad173e5491a675941..c3b6f7b034b3eaff3caf4aaf4cea35515b613633 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -662,6 +662,58 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
            TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
        break;
      case ADDRESS_SPACE_PARAM:
+      if (Src.getOpcode() == NVPTXISD::MoveParam) {
+        // TL;DR: addrspacecast MoveParam to param space is a no-op.
+        //
+        // Longer version is that this particular change is both an
+        // optimization and a work-around for a problem in ptxas for
+        // sm_50+.
+        //
+        // Background:
+        // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal
+        // parameter has its address taken within the called function
+        // [..] the parameter will be copied to the stack if
+        // necessary, and so the address will be in the .local state
+        // space and is accessed via ld.local and st.local
+        // instructions."
+        //
+        // Bug part: 'copied to the stack if necessary' part is
+        // currently broken for byval arguments with alignment < 4 for
+        // sm_50+ in all public versions of CUDA. Taking the address of
+        // such an argument results in SASS code that attempts to store
+        // the value using a 32-bit write to an unaligned address.
+        //
+        // Optimization: On top of the overhead of spill-to-stack, we
+        // also convert that address from local to generic space,
+        // which may result in further overhead. All of it is in most
+        // cases unnecessary, as we only need the value of the
+        // variable, and it can be accessed directly by using the
+        // argument symbol. We'll use the address of the local copy if
+        // it's needed (see step (a) below).
+        //
+        // In order for this bit to do its job we need to:
+        //
+        // a) Let LLVM do the spilling and make sure the IR does not
+        // access byval arguments directly. Instead, the argument
+        // pointer (which is in the default address space) is
+        // addrspacecast'ed to param space, and the data it points to
+        // is copied to allocated local space (i.e. we let compiler
+        // spill it, which gives us an opportunity to optimize the
+        // spill away later if nobody needs its address). Then all
+        // uses of arg pointer are replaced with a pointer to local
+        // copy of the arg. All this is done in NVPTXLowerKernelArgs.
+        //
+        // b) LowerFormalArguments() lowers the argument to
+        // NVPTXISD::MoveParam without any space conversion.
+        //
+        // c) And the final step is done by the code below.
+        // It replaces the addrspacecast (MoveParam) from step (a)
+        // with the arg symbol itself. This can then be used for
+        // [symbol + offset] addressing.
+
+        ReplaceNode(N, Src.getOperand(0).getNode());
+        return;
+      }
        Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                           : NVPTX::nvvm_ptr_gen_to_param;
        break;
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp

index 47a66358463accad939a12a010a546fc1c67f8f2..cdac1ac46956a80f4b9ad1c1693f2ffe738b38d3 100644 (file)
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2078,7 +2078,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Root = DAG.getRoot();
    std::vector<SDValue> OutChains;
  
-  bool isKernel = llvm::isKernelFunction(*F);
    bool isABI = (STI.getSmVersion() >= 20);
    assert(isABI && "Non-ABI compilation is not supported");
    if (!isABI)
@@ -2112,7 +2111,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
              theArgs[i],
              (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                       : nullptr))) {
-      assert(isKernel && "Only kernels can have image/sampler params");
+      assert(llvm::isKernelFunction(*F) &&
+             "Only kernels can have image/sampler params");
        InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
        continue;
      }
@@ -2334,6 +2334,11 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
      // machine instruction fails because TargetExternalSymbol
      // (not lowered) is target dependent, and CopyToReg assumes
      // the source is lowered.
+    //
+    // Byval arguments for regular function are lowered the same way
+    // as for kernels in order to generate better code and work around
+    // a known issue in ptxas. See comments in
+    // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details.
      EVT ObjectVT = getValueType(DL, Ty);
      assert(ObjectVT == Ins[InsIdx].VT &&
             "Ins type did not match function type");
@@ -2341,14 +2346,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
      SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
      if (p.getNode())
        p.getNode()->setIROrder(idx + 1);
-    if (isKernel)
-      InVals.push_back(p);
-    else {
-      SDValue p2 = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
-      InVals.push_back(p2);
-    }
+    InVals.push_back(p);
    }
  
    // Clang will check explicit VarArg and issue error if any. However, Clang
diff --git a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp

index d162a283f7453ddb1de83526314d1a750e55d792..a4769b319a0dcb51721b2f5dfe20729455b58302 100644 (file)
--- a/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@@ -101,6 +101,11 @@ namespace {
  class NVPTXLowerKernelArgs : public FunctionPass {
    bool runOnFunction(Function &F) override;
  
+  // Kernels and regular device functions treat byval arguments
+  // differently.
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
    // handle byval parameters
    void handleByValParam(Argument *Arg);
    // Knowing Ptr must point to the global address space, this function
@@ -192,11 +197,7 @@ void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
  // =============================================================================
  // Main function for this pass.
  // =============================================================================
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  // Skip non-kernels. See the comments at the top of this file.
-  if (!isKernelFunction(F))
-    return false;
-
+bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
    if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
      // Mark pointers in byval structs as global.
      for (auto &B : F) {
@@ -228,6 +229,19 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
    return true;
  }
  
+// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the
+// details on why we need to spill byval arguments into local memory.
+bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
+  for (Argument &Arg : F.args())
+    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
+      handleByValParam(&Arg);
+  return true;
+}
+
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
  FunctionPass *
  llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
    return new NVPTXLowerKernelArgs(TM);
diff --git a/test/CodeGen/NVPTX/bug21465.ll b/test/CodeGen/NVPTX/bug21465.ll

index 2eae41f73a0c3bd31c890c48682766928f364046..acd3cee9848e1bc52acf5893a89aa6bc01086a32 100644 (file)
--- a/test/CodeGen/NVPTX/bug21465.ll
+++ b/test/CodeGen/NVPTX/bug21465.ll
@@ -15,7 +15,7 @@ entry:
    %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
    %0 = load i32, i32* %b, align 4
  ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
    store i32 %0, i32* %output, align 4
  ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
    ret void
diff --git a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll

index 2fffa3eeac15f4a749684ff4ea3e91a15c360156..bdb1d3c546c7a4b87b818cd9f5be6aa9b5f83ec7 100644 (file)
--- a/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@@ -28,20 +28,38 @@ define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) {
  
  %struct.S = type { i32*, i32* }
  
-define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval(
-; CHECK: cvta.to.global.u64
-; CHECK: cvta.to.global.u64
+define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
+; CHECK: ld.param.u64  %[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
+; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
+; CHECK: ld.param.u64  %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
+; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
+  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
+  %b = load i32*, i32** %b_ptr, align 4
+  %v = load i32, i32* %b, align 4
+; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
+  store i32 %v, i32* %output, align 4
+; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
+  ret void
+}
+
+; Regular functions lower byval arguments differently. We need to make
+; sure that we're loading byval argument data using [symbol+offset].
+; There's also no assumption that all pointers within are in global space.
+define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .func ptr_in_byval_func(
+; CHECK: ld.param.u64  %[[optr:rd.*]], [ptr_in_byval_func_param_1]
+; CHECK: ld.param.u64  %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
    %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
    %b = load i32*, i32** %b_ptr, align 4
    %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32
+; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
    store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32
+; CHECK: st.u32 [%[[optr]]], %[[val]]
    ret void
  }
  
  !nvvm.annotations = !{!0, !1, !2}
  !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
  !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
author	Artem Belevich <tra@google.com>
	Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)
committer	Artem Belevich <tra@google.com>
	Tue, 21 Jun 2016 20:30:26 +0000 (20:30 +0000)
lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp		patch \| blob \| history
lib/Target/NVPTX/NVPTXISelLowering.cpp		patch \| blob \| history
lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp		patch \| blob \| history
test/CodeGen/NVPTX/bug21465.ll		patch \| blob \| history
test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll		patch \| blob \| history