From e61b6779e4a9016df460ecd315e8c0186a6719df Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 13 Jul 2018 16:40:25 +0000 Subject: [PATCH] AMDGPU: Fix handling of alignment padding in DAG argument lowering This was completely broken if there was ever a struct argument, as this information is thrown away during the argument analysis. The offsets as passed in to LowerFormalArguments are not useful, as they partially depend on the legalized result register type, and they don't consider the alignment in the first place. Ignore the Ins array, and instead figure out from the raw IR type what we need to do. This seems to fix the padding computation if the DAG lowering is forced (and stops breaking arguments following padded arguments if the arguments were only partially lowered in the IR) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@337021 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 14 +- .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 11 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 183 +++++++++++------- lib/Target/AMDGPU/AMDGPUISelLowering.h | 7 +- .../AMDGPU/AMDGPULowerKernelArguments.cpp | 5 +- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 11 +- lib/Target/AMDGPU/AMDGPUMachineFunction.h | 15 +- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 74 +++---- lib/Target/AMDGPU/AMDGPUSubtarget.h | 28 +-- lib/Target/AMDGPU/R600.td | 5 - lib/Target/AMDGPU/R600ISelLowering.cpp | 21 +- lib/Target/AMDGPU/SIISelLowering.cpp | 9 +- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 24 +-- test/CodeGen/AMDGPU/kernel-args.ll | 77 ++++++-- .../AMDGPU/kernel-argument-dag-lowering.ll | 132 +++++++++++++ .../AMDGPU/llvm.amdgcn.implicitarg.ptr.ll | 16 +- .../AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll | 2 +- 17 files changed, 420 insertions(+), 214 deletions(-) create mode 100644 test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 5b18aefbd78..152f8ecdf29 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1128,6 +1128,13 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &CurrentProgramInfo, const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + + // Avoid asserting on erroneous cases. + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL && + F.getCallingConv() != CallingConv::SPIR_KERNEL) + return; + const SIMachineFunctionInfo *MFI = MF.getInfo(); const GCNSubtarget &STM = MF.getSubtarget(); @@ -1174,9 +1181,8 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.isXNACKEnabled()) Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; - // FIXME: Should use getKernArgSize - Out.kernarg_segment_byte_size = - STM.getKernArgSegmentSize(MF.getFunction(), MFI->getExplicitKernArgSize()); + unsigned MaxKernArgAlign; + Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign); Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; @@ -1185,7 +1191,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. 
Out.kernarg_segment_alignment = std::max((size_t)4, - countTrailingZeros(MFI->getMaxKernArgAlign())); + countTrailingZeros(MaxKernArgAlign)); if (STM.debuggerEmitPrologue()) { Out.debug_wavefront_private_segment_offset_sgpr = diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b33079ae4ba..29e93a9d9d3 100644 --- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -209,15 +209,16 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps( const Function &F = MF.getFunction(); // Avoid asserting on erroneous cases. - if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL) + if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL && + F.getCallingConv() != CallingConv::SPIR_KERNEL) return HSACodeProps; - HSACodeProps.mKernargSegmentSize = - STM.getKernArgSegmentSize(F, MFI.getExplicitKernArgSize()); + unsigned MaxKernArgAlign; + HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F, + MaxKernArgAlign); HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize; HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize; - HSACodeProps.mKernargSegmentAlign = - std::max(uint32_t(4), MFI.getMaxKernArgAlign()); + HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u); HSACodeProps.mWavefrontSize = STM.getWavefrontSize(); HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR; HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index acdedab7e13..583a09e34ab 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -30,6 +30,7 @@ #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -40,18 +41,6 @@ #include "llvm/Support/KnownBits.h" using namespace llvm; -static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - MachineFunction &MF = State.getMachineFunction(); - AMDGPUMachineFunction *MFI = MF.getInfo(); - - uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - return true; -} - static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, @@ -910,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// for each individual part is i8. We pass the memory type as LocVT to the /// calling convention analysis function and the register type (Ins[x].VT) as /// the ValVT. -void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl &Ins) const { - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - const ISD::InputArg &In = Ins[i]; - EVT MemVT; - - unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT); - - if (!Subtarget->isAmdHsaOS() && - (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) { - // The ABI says the caller will extend these values to 32-bits. - MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32; - } else if (NumRegs == 1) { - // This argument is not split, so the IR type is the memory type. 
- assert(!In.Flags.isSplit()); - if (In.ArgVT.isExtended()) { - // We have an extended type, like i24, so we should just use the register type - MemVT = In.VT; - } else { - MemVT = In.ArgVT; - } - } else if (In.ArgVT.isVector() && In.VT.isVector() && - In.ArgVT.getScalarType() == In.VT.getScalarType()) { - assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements()); - // We have a vector value which has been split into a vector with - // the same scalar type, but fewer elements. This should handle - // all the floating-point vector types. - MemVT = In.VT; - } else if (In.ArgVT.isVector() && - In.ArgVT.getVectorNumElements() == NumRegs) { - // This arg has been split so that each element is stored in a separate - // register. - MemVT = In.ArgVT.getScalarType(); - } else if (In.ArgVT.isExtended()) { - // We have an extended type, like i65. - MemVT = In.VT; - } else { - unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs; - assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0); - if (In.VT.isInteger()) { - MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); - } else if (In.VT.isVector()) { - assert(!In.VT.getScalarType().isFloatingPoint()); - unsigned NumElements = In.VT.getVectorNumElements(); - assert(MemoryBits % NumElements == 0); - // This vector type has been split into another vector type with - // a different elements size. - EVT ScalarVT = EVT::getIntegerVT(State.getContext(), - MemoryBits / NumElements); - MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); +void AMDGPUTargetLowering::analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl &Ins) const { + const MachineFunction &MF = State.getMachineFunction(); + const Function &Fn = MF.getFunction(); + LLVMContext &Ctx = Fn.getParent()->getContext(); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn); + + unsigned MaxAlign = 1; + uint64_t ExplicitArgOffset = 0; + const DataLayout &DL = Fn.getParent()->getDataLayout(); + + unsigned InIndex = 0; + + for (const Argument &Arg : Fn.args()) { + Type *BaseArgTy = Arg.getType(); + unsigned Align = DL.getABITypeAlignment(BaseArgTy); + MaxAlign = std::max(Align, MaxAlign); + unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy); + + uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset; + ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize; + + // We're basically throwing away everything passed into us and starting over + // to get accurate in-memory offsets. The "PartOffset" is completely useless + // to us as computed in Ins. + // + // We also need to figure out what type legalization is trying to do to get + // the correct memory offsets. + + SmallVector ValueVTs; + SmallVector Offsets; + ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset); + + for (unsigned Value = 0, NumValues = ValueVTs.size(); + Value != NumValues; ++Value) { + uint64_t BasePartOffset = Offsets[Value]; + + EVT ArgVT = ValueVTs[Value]; + EVT MemVT = ArgVT; + MVT RegisterVT = + getRegisterTypeForCallingConv(Ctx, ArgVT); + unsigned NumRegs = + getNumRegistersForCallingConv(Ctx, ArgVT); + + if (!Subtarget->isAmdHsaOS() && + (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) { + // The ABI says the caller will extend these values to 32-bits. + MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32; + } else if (NumRegs == 1) { + // This argument is not split, so the IR type is the memory type. 
+ if (ArgVT.isExtended()) { + // We have an extended type, like i24, so we should just use the + // register type. + MemVT = RegisterVT; + } else { + MemVT = ArgVT; + } + } else if (ArgVT.isVector() && RegisterVT.isVector() && + ArgVT.getScalarType() == RegisterVT.getScalarType()) { + assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements()); + // We have a vector value which has been split into a vector with + // the same scalar type, but fewer elements. This should handle + // all the floating-point vector types. + MemVT = RegisterVT; + } else if (ArgVT.isVector() && + ArgVT.getVectorNumElements() == NumRegs) { + // This arg has been split so that each element is stored in a separate + // register. + MemVT = ArgVT.getScalarType(); + } else if (ArgVT.isExtended()) { + // We have an extended type, like i65. + MemVT = RegisterVT; } else { - llvm_unreachable("cannot deduce memory type."); + unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs; + assert(ArgVT.getStoreSizeInBits() % NumRegs == 0); + if (RegisterVT.isInteger()) { + MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits); + } else if (RegisterVT.isVector()) { + assert(!RegisterVT.getScalarType().isFloatingPoint()); + unsigned NumElements = RegisterVT.getVectorNumElements(); + assert(MemoryBits % NumElements == 0); + // This vector type has been split into another vector type with + // a different elements size. + EVT ScalarVT = EVT::getIntegerVT(State.getContext(), + MemoryBits / NumElements); + MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements); + } else { + llvm_unreachable("cannot deduce memory type."); + } } - } - // Convert one element vectors to scalar. - if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) - MemVT = MemVT.getScalarType(); + // Convert one element vectors to scalar. 
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1) + MemVT = MemVT.getScalarType(); - if (MemVT.isExtended()) { - // This should really only happen if we have vec3 arguments - assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); - MemVT = MemVT.getPow2VectorType(State.getContext()); - } + if (MemVT.isExtended()) { + // This should really only happen if we have vec3 arguments + assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3); + MemVT = MemVT.getPow2VectorType(State.getContext()); + } - assert(MemVT.isSimple()); - allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags, - State); + unsigned PartOffset = 0; + for (unsigned i = 0; i != NumRegs; ++i) { + State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT, + BasePartOffset + PartOffset, + MemVT.getSimpleVT(), + CCValAssign::Full)); + PartOffset += MemVT.getStoreSize(); + } + } } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 1e027dd6712..096e40230c6 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -122,8 +122,11 @@ protected: SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const; void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl &Results) const; - void analyzeFormalArgumentsCompute(CCState &State, - const SmallVectorImpl &Ins) const; + + void analyzeFormalArgumentsCompute( + CCState &State, + const SmallVectorImpl &Ins) const; + public: AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 3c5760804b3..8cc7e38f7b2 100644 --- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -77,8 +77,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F); + unsigned MaxAlign; // FIXME: Alignment is broken broken with explicit arg offset.; - const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F); + const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign); if (TotalKernArgSize == 0) return false; @@ -91,13 +92,11 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) { Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize)); unsigned AS = KernArgSegment->getType()->getPointerAddressSpace(); - unsigned MaxAlign = 1; uint64_t ExplicitArgOffset = 0; for (Argument &Arg : F.args()) { Type *ArgTy = Arg.getType(); unsigned Align = DL.getABITypeAlignment(ArgTy); - MaxAlign = std::max(Align, MaxAlign); unsigned Size = DL.getTypeSizeInBits(ArgTy); unsigned AllocSize = DL.getTypeAllocSize(ArgTy); diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 0574c991ee6..13b4b50149c 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -24,16 +24,23 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), MemoryBound(false), WaveLimiter(false) { + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF); + // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. 
+ const Function &F = MF.getFunction(); if (auto *Resolver = MF.getMMI().getResolver()) { if (AMDGPUPerfHintAnalysis *PHA = static_cast( Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) { - MemoryBound = PHA->isMemoryBound(&MF.getFunction()); - WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction()); + MemoryBound = PHA->isMemoryBound(&F); + WaveLimiter = PHA->needsWaveLimiter(&F); } } + + CallingConv::ID CC = F.getCallingConv(); + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) + ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign); } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h index 2c4bf328008..8d6b871bc03 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -23,8 +23,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { SmallDenseMap LocalMemoryObjects; protected: - uint64_t ExplicitKernArgSize; - unsigned MaxKernArgAlign; + uint64_t ExplicitKernArgSize; // Cache for this. + unsigned MaxKernArgAlign; // Cache for this. /// Number of bytes in the LDS that are being used. unsigned LDSSize; @@ -44,17 +44,6 @@ protected: public: AMDGPUMachineFunction(const MachineFunction &MF); - uint64_t allocateKernArg(uint64_t Size, unsigned Align) { - assert(isPowerOf2_32(Align)); - ExplicitKernArgSize = alignTo(ExplicitKernArgSize, Align); - - uint64_t Result = ExplicitKernArgSize; - ExplicitKernArgSize += Size; - - MaxKernArgAlign = std::max(Align, MaxKernArgAlign); - return Result; - } - uint64_t getExplicitKernArgSize() const { return ExplicitKernArgSize; } diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 3efc564c855..98b49070fa9 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -209,7 +209,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), - TLInfo(TM, *this), + TLInfo(TM, *this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { AS = AMDGPU::getAMDGPUAS(TT); CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); @@ -406,6 +406,44 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { return true; } +uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const { + assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL || + F.getCallingConv() == CallingConv::SPIR_KERNEL); + + const DataLayout &DL = F.getParent()->getDataLayout(); + uint64_t ExplicitArgBytes = 0; + MaxAlign = 1; + + for (const Argument &Arg : F.args()) { + Type *ArgTy = Arg.getType(); + + unsigned Align = DL.getABITypeAlignment(ArgTy); + uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); + ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; + MaxAlign = std::max(MaxAlign, Align); + } + + return ExplicitArgBytes; +} + +unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const { + uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign); + + unsigned ExplicitOffset = getExplicitKernelArgOffset(F); + + uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; + unsigned ImplicitBytes = getImplicitArgNumBytes(F); + if (ImplicitBytes != 0) { + unsigned Alignment = getAlignmentForImplicitArgPtr(); + TotalSize = alignTo(ExplicitArgBytes, Alignment) + 
ImplicitBytes; + } + + // Being able to dereference past the end is useful for emitting scalar loads. + return alignTo(TotalSize, 4); +} + R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : R600GenSubtargetInfo(TT, GPU, FS), @@ -446,40 +484,6 @@ bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const { return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv()); } -uint64_t GCNSubtarget::getExplicitKernArgSize(const Function &F) const { - assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL); - - const DataLayout &DL = F.getParent()->getDataLayout(); - uint64_t ExplicitArgBytes = 0; - for (const Argument &Arg : F.args()) { - Type *ArgTy = Arg.getType(); - - unsigned Align = DL.getABITypeAlignment(ArgTy); - uint64_t AllocSize = DL.getTypeAllocSize(ArgTy); - ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize; - } - - return ExplicitArgBytes; -} - -unsigned GCNSubtarget::getKernArgSegmentSize(const Function &F, - int64_t ExplicitArgBytes) const { - if (ExplicitArgBytes == -1) - ExplicitArgBytes = getExplicitKernArgSize(F); - - unsigned ExplicitOffset = getExplicitKernelArgOffset(F); - - uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes; - unsigned ImplicitBytes = getImplicitArgNumBytes(F); - if (ImplicitBytes != 0) { - unsigned Alignment = getAlignmentForImplicitArgPtr(); - TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes; - } - - // Being able to dereference past the end is useful for emitting scalar loads. - return alignTo(TotalSize, 4); -} - unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index d9806d6133c..62310973365 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -51,7 +51,7 @@ public: enum Generation { R600 = 0, R700 = 1, - EVERGREEN = 2, + EVERGREEN = 2, NORTHERN_ISLANDS = 3, SOUTHERN_ISLANDS = 4, SEA_ISLANDS = 5, @@ -82,7 +82,7 @@ public: static const AMDGPUSubtarget &get(const MachineFunction &MF); static const AMDGPUSubtarget &get(const TargetMachine &TM, - const Function &F); + const Function &F); /// \returns Default range flat work group size for a calling convention. std::pair getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; @@ -231,6 +231,18 @@ public: /// Creates value range metadata on an workitemid.* inrinsic call or load. bool makeLIDRangeMetadata(Instruction *I) const; + /// \returns Number of bytes of arguments that are passed to a shader or + /// kernel in addition to the explicit ones declared for the function. + unsigned getImplicitArgNumBytes(const Function &F) const { + if (isMesaKernel(F)) + return 16; + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + } + uint64_t getExplicitKernArgSize(const Function &F, + unsigned &MaxAlign) const; + unsigned getKernArgSegmentSize(const Function &F, + unsigned &MaxAlign) const; + virtual ~AMDGPUSubtarget() {} }; @@ -669,14 +681,6 @@ public: return D16PreservesUnusedBits; } - /// \returns Number of bytes of arguments that are passed to a shader or - /// kernel in addition to the explicit ones declared for the function. 
- unsigned getImplicitArgNumBytes(const Function &F) const { - if (isMesaKernel(F)) - return 16; - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); - } - // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspecive of an arbitrary workitem, this // is 4-byte aligned. @@ -825,10 +829,6 @@ public: return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; } - uint64_t getExplicitKernArgSize(const Function &F) const; - unsigned getKernArgSegmentSize(const Function &F, - int64_t ExplicitArgBytes = -1) const; - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td index ff96928211c..5c9c1c1ed50 100644 --- a/lib/Target/AMDGPU/R600.td +++ b/lib/Target/AMDGPU/R600.td @@ -52,8 +52,3 @@ def CC_R600 : CallingConv<[ T30_XYZW, T31_XYZW, T32_XYZW ]>>> ]>; - -// Calling convention for compute kernels -def CC_R600_Kernel : CallingConv<[ - CCCustom<"allocateKernArg"> -]>; diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 4110e6a28d6..113d6249fa6 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -50,18 +50,6 @@ using namespace llvm; -static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - MachineFunction &MF = State.getMachineFunction(); - AMDGPUMachineFunction *MFI = MF.getInfo(); - - uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), - ArgFlags.getOrigAlign()); - State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); - return true; -} - #include "R600GenCallingConv.inc" R600TargetLowering::R600TargetLowering(const TargetMachine &TM, @@ -234,7 +222,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f32, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); } - + // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we // need it for R600. 
if (!Subtarget->hasFP32Denormals()) @@ -1583,7 +1571,7 @@ CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::C: case CallingConv::Fast: case CallingConv::Cold: - return CC_R600_Kernel; + llvm_unreachable("kernels should not be handled here"); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: @@ -1658,13 +1646,12 @@ SDValue R600TargetLowering::LowerFormalArguments( unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset(); unsigned PartOffset = VA.getLocMemOffset(); - unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF.getFunction()) + - VA.getLocMemOffset(); MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase); SDValue Arg = DAG.getLoad( ISD::UNINDEXED, Ext, VT, DL, Chain, - DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo, + DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), + PtrInfo, MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 5721669bf7c..177cec982f3 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1164,8 +1164,8 @@ SDValue SITargetLowering::lowerKernargMemParameter( // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. The load should hopefully be merged with // the previous argument. - if (Align < 4) { - assert(MemVT.getStoreSize() < 4); + if (MemVT.getStoreSize() < 4 && Align < 4) { + // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). int64_t AlignDownOffset = alignDown(Offset, 4); int64_t OffsetDiff = Offset - AlignDownOffset; @@ -1781,7 +1781,6 @@ SDValue SITargetLowering::LowerFormalArguments( // FIXME: Alignment of explicit arguments totally broken with non-0 explicit // kern arg offset. const unsigned KernelArgBaseAlign = 16; - const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn); for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; @@ -1797,11 +1796,9 @@ SDValue SITargetLowering::LowerFormalArguments( VT = Ins[i].VT; EVT MemVT = VA.getLocVT(); - const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset(); + const uint64_t Offset = VA.getLocMemOffset(); unsigned Align = MinAlign(KernelArgBaseAlign, Offset); - // The first 36 bytes of the input buffer contains information about - // thread group and global sizes for clover. 
SDValue Arg = lowerKernargMemParameter( DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 7c5bc7431e4..0d5ff75e37e 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -54,6 +54,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) Occupancy = getMaxWavesPerEU(); limitOccupancy(MF); + CallingConv::ID CC = F.getCallingConv(); + + if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { + if (!F.arg_empty()) + KernargSegmentPtr = true; + WorkGroupIDX = true; + WorkItemIDX = true; + } else if (CC == CallingConv::AMDGPU_PS) { + PSInputAddr = AMDGPU::getInitialPSInputAddr(F); + } if (!isEntryFunction()) { // Non-entry functions have no special inputs for now, other registers @@ -73,21 +83,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } else { if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) { KernargSegmentPtr = true; - assert(MaxKernArgAlign == 0); - MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr(); + MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(), + MaxKernArgAlign); } } - CallingConv::ID CC = F.getCallingConv(); - if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - if (!F.arg_empty()) - KernargSegmentPtr = true; - WorkGroupIDX = true; - WorkItemIDX = true; - } else if (CC == CallingConv::AMDGPU_PS) { - PSInputAddr = AMDGPU::getInitialPSInputAddr(F); - } - if (ST.debuggerEmitPrologue()) { // Enable everything. WorkGroupIDX = true; diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 5c2c868476b..9492b710d13 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -589,6 +589,17 @@ entry: ; ret void ; } +; FUNC-LABEL: {{^}}i65_arg: +; HSA-VI: kernarg_segment_byte_size = 24 +; HSA-VI: kernarg_segment_alignment = 4 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { +entry: + store i65 %in, i65 addrspace(1)* %out, align 4 + ret void +} + ; FUNC-LABEL: {{^}}i1_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 @@ -651,7 +662,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi } ; FUNC-LABEL: {{^}}empty_struct_arg: -; HSA: kernarg_segment_byte_size = 0 +; HSA-VI: kernarg_segment_byte_size = 0 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ret void } @@ -667,11 +678,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { ; FIXME: Total argument size is computed wrong ; FUNC-LABEL: {{^}}struct_argument_alignment: -; HSA: kernarg_segment_byte_size = 40 -; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 -; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 -; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: kernarg_segment_byte_size = 40 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { %val0 = extractvalue 
{i32, i64} %arg0, 0 %val1 = extractvalue {i32, i64} %arg0, 1 @@ -687,11 +698,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; No padding between i8 and next struct, but round up at end to 4 byte ; multiple. ; FUNC-LABEL: {{^}}packed_struct_argument_alignment: -; HSA: kernarg_segment_byte_size = 28 -; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 -; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc -; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 +; HSA-VI: kernarg_segment_byte_size = 28 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 @@ -703,3 +714,47 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, store volatile i64 %val3, i64 addrspace(1)* null ret void } + +; GCN-LABEL: {{^}}struct_argument_alignment_after: +; HSA-VI: kernarg_segment_byte_size = 64 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 +define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { + %val0 = extractvalue {i32, i64} %arg0, 0 + %val1 = extractvalue {i32, i64} %arg0, 1 + %val2 = extractvalue {i32, i64} %arg2, 0 + %val3 = extractvalue {i32, i64} %arg2, 1 + store volatile i32 %val0, i32 addrspace(1)* null + store volatile i64 %val1, i64 addrspace(1)* null + store volatile i32 %val2, i32 addrspace(1)* null + store volatile i64 %val3, i64 addrspace(1)* null + store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}array_3xi32: +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc +define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { + store volatile i16 %arg0, i16 addrspace(1)* undef + store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef + ret void +} + +; FIXME: Why not all scalar loads? 
+; GCN-LABEL: {{^}}array_3xi16: +; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2 +; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0 +; HSA-VI: flat_load_ushort +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll new file mode 100644 index 00000000000..a1bb6c28e74 --- /dev/null +++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll @@ -0,0 +1,132 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s + +; Repeat of some problematic tests in kernel-args.ll, with the IR +; argument lowering pass disabled. Struct padding needs to be +; accounted for, as well as legalization of types changing offsets. + +; FUNC-LABEL: {{^}}i1_arg: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 + +; GCN: s_load_dword s +; GCN: s_and_b32 +define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { + store i1 %x, i1 addrspace(1)* %out, align 1 + ret void +} + +; FUNC-LABEL: {{^}}v3i8_arg: +; HSA-VI: kernarg_segment_byte_size = 12 +; HSA-VI: kernarg_segment_alignment = 4 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +entry: + store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}i65_arg: +; HSA-VI: kernarg_segment_byte_size = 24 +; HSA-VI: kernarg_segment_alignment = 4 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind { +entry: + store i65 %in, i65 addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}empty_struct_arg: +; HSA-VI: kernarg_segment_byte_size = 0 +define amdgpu_kernel void @empty_struct_arg({} %in) nounwind { + ret void +} + +; The correct load offsets for these: +; load 4 from 0, +; load 8 from 8 +; load 4 from 24 +; load 8 from 32 + +; With the SelectionDAG argument lowering, the alignments for the +; struct members is not properly considered, making these wrong. + +; FIXME: Total argument size is computed wrong +; FUNC-LABEL: {{^}}struct_argument_alignment: +; HSA-VI: kernarg_segment_byte_size = 40 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) { + %val0 = extractvalue {i32, i64} %arg0, 0 + %val1 = extractvalue {i32, i64} %arg0, 1 + %val2 = extractvalue {i32, i64} %arg1, 0 + %val3 = extractvalue {i32, i64} %arg1, 1 + store volatile i32 %val0, i32 addrspace(1)* null + store volatile i64 %val1, i64 addrspace(1)* null + store volatile i32 %val2, i32 addrspace(1)* null + store volatile i64 %val3, i64 addrspace(1)* null + ret void +} + +; No padding between i8 and next struct, but round up at end to 4 byte +; multiple. 
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment: +; HSA-VI: kernarg_segment_byte_size = 28 +; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 +define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { + %val0 = extractvalue <{i32, i64}> %arg0, 0 + %val1 = extractvalue <{i32, i64}> %arg0, 1 + %val2 = extractvalue <{i32, i64}> %arg1, 0 + %val3 = extractvalue <{i32, i64}> %arg1, 1 + store volatile i32 %val0, i32 addrspace(1)* null + store volatile i64 %val1, i64 addrspace(1)* null + store volatile i32 %val2, i32 addrspace(1)* null + store volatile i64 %val3, i64 addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}struct_argument_alignment_after: +; HSA-VI: kernarg_segment_byte_size = 64 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18 +; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20 +; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30 +define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) { + %val0 = extractvalue {i32, i64} %arg0, 0 + %val1 = extractvalue {i32, i64} %arg0, 1 + %val2 = extractvalue {i32, i64} %arg2, 0 + %val3 = extractvalue {i32, i64} %arg2, 1 + store volatile i32 %val0, i32 addrspace(1)* null + store volatile i64 %val1, i64 addrspace(1)* null + store volatile i32 %val2, i32 addrspace(1)* null + store volatile i64 %val3, i64 addrspace(1)* null + store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null + ret void +} + +; GCN-LABEL: {{^}}array_3xi32: +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc +define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { + store volatile i16 %arg0, i16 addrspace(1)* undef + store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}array_3xi16: +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4 +define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) { + store volatile i8 %arg0, i8 addrspace(1)* undef + store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef + ret void +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll index f860a122a88..6a9191e7dcb 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 { ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 112 -; MESA: kernarg_segment_byte_size = 464 +; MESA: kernarg_segment_byte_size = 128 ; HSA: s_load_dword s0, s[4:5], 0x1c define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 { @@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 { ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 160 -; MESA: kernarg_segment_byte_size = 464 +; MESA: kernarg_segment_byte_size = 128 ; HSA: s_load_dword s0, s[4:5], 0x1c define amdgpu_kernel void 
@opencl_kernel_implicitarg_ptr([112 x i8]) #1 { @@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 { ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func: ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 112 -; MESA: kernarg_segment_byte_size = 464 +; MESA: kernarg_segment_byte_size = 128 ; HSA: s_add_u32 s6, s4, 0x70 -; MESA: s_add_u32 s6, s4, 0x1c0 +; MESA: s_add_u32 s6, s4, 0x70 ; GCN: s_addc_u32 s7, s5, 0{{$}} ; GCN: s_swappc_b64 @@ -133,10 +133,9 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 { ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func: ; GCN: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 160 -; MESA: kernarg_segment_byte_size = 464 +; MESA: kernarg_segment_byte_size = 128 -; HSA: s_add_u32 s6, s4, 0x70 -; MESA: s_add_u32 s6, s4, 0x1c0 +; GCN: s_add_u32 s6, s4, 0x70 ; GCN: s_addc_u32 s7, s5, 0{{$}} ; GCN: s_swappc_b64 @@ -219,8 +218,7 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 { ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func: ; GCN: s_mov_b64 s[6:7], s[4:5] -; HSA: s_add_u32 s8, s6, 0x70 -; MESA: s_add_u32 s8, s6, 0x1c0 +; GCN: s_add_u32 s8, s6, 0x70 ; GCN: s_addc_u32 s9, s7, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 { diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 6c1bc9eaa76..5853d8d8e4e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1 ; HSA: kernarg_segment_byte_size = 0 ; OS-MESA3D: kernarg_segment_byte_size = 16 -; CO-V2: kernarg_segment_alignment = 32 +; CO-V2: kernarg_segment_alignment = 4 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5] define amdgpu_kernel void @test_no_kernargs() #1 { -- 2.50.1
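
For context on the offset arithmetic this patch centralizes in AMDGPUSubtarget::getExplicitKernArgSize and getKernArgSegmentSize, the sketch below mirrors that computation outside of LLVM: each argument is placed at an offset aligned up to its ABI type alignment before its alloc size is added, the maximum alignment is reported through an out-parameter, implicit arguments (if any) are appended at their own alignment, and the total is rounded up to 4 bytes so scalar loads can dereference slightly past the end. The ArgInfo struct, the standalone alignTo helper, and the main() driver are illustrative stand-ins assumed for this sketch, not LLVM API; the sizes and alignments in main() correspond to the struct_argument_alignment test above.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Round Value up to the next multiple of Align (a power of two), mirroring
// llvm::alignTo for the purposes of this sketch.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// Stand-in for one IR argument: its in-memory size and ABI alignment as the
// DataLayout would report them (getTypeAllocSize / getABITypeAlignment).
struct ArgInfo {
  uint64_t AllocSize;
  unsigned Align;
};

// Mirrors AMDGPUSubtarget::getExplicitKernArgSize: place each argument at an
// offset aligned to its ABI alignment and report the largest alignment seen.
static uint64_t explicitKernArgSize(const std::vector<ArgInfo> &Args,
                                    unsigned &MaxAlign) {
  uint64_t Bytes = 0;
  MaxAlign = 1;
  for (const ArgInfo &A : Args) {
    Bytes = alignTo(Bytes, A.Align) + A.AllocSize;
    MaxAlign = std::max(MaxAlign, A.Align);
  }
  return Bytes;
}

// Mirrors getKernArgSegmentSize: add the target's explicit kernarg offset,
// append any implicit-argument bytes at their own alignment, and round the
// result up to 4 bytes.
static uint64_t kernArgSegmentSize(const std::vector<ArgInfo> &Args,
                                   unsigned ExplicitOffset,
                                   unsigned ImplicitBytes,
                                   unsigned ImplicitAlign,
                                   unsigned &MaxAlign) {
  uint64_t ExplicitArgBytes = explicitKernArgSize(Args, MaxAlign);
  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  if (ImplicitBytes != 0)
    TotalSize = alignTo(ExplicitArgBytes, ImplicitAlign) + ImplicitBytes;
  return alignTo(TotalSize, 4);
}

int main() {
  // ({i32, i64} %arg0, i8, {i32, i64} %arg1): each struct is 16 bytes with
  // 8-byte alignment, the i8 is 1 byte, matching struct_argument_alignment.
  std::vector<ArgInfo> Args = {{16, 8}, {1, 1}, {16, 8}};
  unsigned MaxAlign = 1;
  // No explicit offset and no implicit arguments for this case, so the
  // implicit-arg alignment argument (8 here) is unused.
  std::cout << kernArgSegmentSize(Args, 0, 0, 8, MaxAlign) << '\n'; // 40
  std::cout << MaxAlign << '\n';                                    // 8
}
```

Run as-is, the sketch prints 40 and 8, matching the kernarg_segment_byte_size = 40 checked by the struct_argument_alignment test and the member loads at 0x0, 0x8, 0x18, and 0x20: the i8 lands at offset 16 and the second struct is padded up to offset 24.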