From 3568925e1dc883ae5a9a301d0475e9997975da27 Mon Sep 17 00:00:00 2001
From: Reid Kleckner
Date: Fri, 31 Oct 2014 22:00:51 +0000
Subject: [PATCH] Implement IRGen for the x86 vectorcall convention

The most complex aspect of the convention is the handling of homogeneous
vector and floating-point aggregates. Reuse the homogeneous aggregate
classification code that we use on PPC64 and ARM for this.

This convention also has a C mangling, which is implemented in both Clang
and LLVM.

Reviewed By: majnemer

Differential Revision: http://reviews.llvm.org/D6063

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@221006 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/AST/Mangle.cpp                         |  53 ++++---
 lib/Basic/Targets.cpp                      |   2 +
 lib/CodeGen/CGCall.cpp                     |   8 +-
 lib/CodeGen/MicrosoftCXXABI.cpp            |  11 +-
 lib/CodeGen/TargetInfo.cpp                 | 152 +++++++++++++++++----
 test/CodeGen/mangle-windows.c              |  39 +++++-
 test/CodeGen/microsoft-call-conv.c         |   7 +-
 test/CodeGen/vectorcall.c                  |  77 +++++++++++
 test/CodeGenCXX/homogeneous-aggregates.cpp |  45 +++++-
 9 files changed, 332 insertions(+), 62 deletions(-)
 create mode 100644 test/CodeGen/vectorcall.c

diff --git a/lib/AST/Mangle.cpp b/lib/AST/Mangle.cpp
index b5dbdca9c9..ae6c5f296d 100644
--- a/lib/AST/Mangle.cpp
+++ b/lib/AST/Mangle.cpp
@@ -49,10 +49,11 @@ static void mangleFunctionBlock(MangleContext &Context,
 
 void MangleContext::anchor() { }
 
-enum StdOrFastCC {
-  SOF_OTHER,
-  SOF_FAST,
-  SOF_STD
+enum CCMangling {
+  CCM_Other,
+  CCM_Fast,
+  CCM_Vector,
+  CCM_Std
 };
 
 static bool isExternC(const NamedDecl *ND) {
@@ -61,20 +62,22 @@ static bool isExternC(const NamedDecl *ND) {
   return cast<FunctionDecl>(ND)->isExternC();
 }
 
-static StdOrFastCC getStdOrFastCallMangling(const ASTContext &Context,
-                                            const NamedDecl *ND) {
+static CCMangling getCallingConvMangling(const ASTContext &Context,
+                                         const NamedDecl *ND) {
   const TargetInfo &TI = Context.getTargetInfo();
   const llvm::Triple &Triple = TI.getTriple();
-  if (!Triple.isOSWindows() || Triple.getArch() != llvm::Triple::x86)
-    return SOF_OTHER;
+  if (!Triple.isOSWindows() ||
+      !(Triple.getArch() == llvm::Triple::x86 ||
+        Triple.getArch() == llvm::Triple::x86_64))
+    return CCM_Other;
 
   if (Context.getLangOpts().CPlusPlus && !isExternC(ND) &&
       TI.getCXXABI() == TargetCXXABI::Microsoft)
-    return SOF_OTHER;
+    return CCM_Other;
 
   const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND);
   if (!FD)
-    return SOF_OTHER;
+    return CCM_Other;
   QualType T = FD->getType();
 
   const FunctionType *FT = T->castAs<FunctionType>();
@@ -82,19 +85,21 @@ static StdOrFastCC getStdOrFastCallMangling(const ASTContext &Context,
   CallingConv CC = FT->getCallConv();
   switch (CC) {
   default:
-    return SOF_OTHER;
+    return CCM_Other;
   case CC_X86FastCall:
-    return SOF_FAST;
+    return CCM_Fast;
   case CC_X86StdCall:
-    return SOF_STD;
+    return CCM_Std;
+  case CC_X86VectorCall:
+    return CCM_Vector;
   }
 }
 
 bool MangleContext::shouldMangleDeclName(const NamedDecl *D) {
   const ASTContext &ASTContext = getASTContext();
 
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
-  if (CC != SOF_OTHER)
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
+  if (CC != CCM_Other)
     return true;
 
   // In C, functions with no attributes never need to be mangled. Fastpath them.
@@ -131,10 +136,10 @@ void MangleContext::mangleName(const NamedDecl *D, raw_ostream &Out) {
   }
 
   const ASTContext &ASTContext = getASTContext();
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
   bool MCXX = shouldMangleCXXName(D);
   const TargetInfo &TI = Context.getTargetInfo();
-  if (CC == SOF_OTHER || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
+  if (CC == CCM_Other || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
     if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D))
       mangleObjCMethodName(OMD, Out);
     else
@@ -143,9 +148,9 @@ void MangleContext::mangleName(const NamedDecl *D, raw_ostream &Out) {
   }
 
   Out << '\01';
-  if (CC == SOF_STD)
+  if (CC == CCM_Std)
     Out << '_';
-  else
+  else if (CC == CCM_Fast)
     Out << '@';
 
   if (!MCXX)
@@ -158,6 +163,8 @@ void MangleContext::mangleName(const NamedDecl *D, raw_ostream &Out) {
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);
+  if (CC == CCM_Vector)
+    Out << '@';
   Out << '@';
   if (!Proto) {
     Out << '0';
@@ -169,9 +176,11 @@ void MangleContext::mangleName(const NamedDecl *D, raw_ostream &Out) {
     if (!MD->isStatic())
       ++ArgWords;
   for (const auto &AT : Proto->param_types())
-    // Size should be aligned to DWORD boundary
-    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT), 32) / 32;
-  Out << 4 * ArgWords;
+    // Size should be aligned to pointer size.
+    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT),
+                                         TI.getPointerWidth(0)) /
+                TI.getPointerWidth(0);
+  Out << ((TI.getPointerWidth(0) / 8) * ArgWords);
 }
 
 void MangleContext::mangleGlobalBlock(const BlockDecl *BD,
diff --git a/lib/Basic/Targets.cpp b/lib/Basic/Targets.cpp
index 587b13fe41..c9c05b845d 100644
--- a/lib/Basic/Targets.cpp
+++ b/lib/Basic/Targets.cpp
@@ -3503,6 +3503,7 @@ public:
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64Win64) ? CCCR_OK : CCCR_Warning;
   }
@@ -3542,6 +3543,7 @@ public:
   }
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64SysV) ? CCCR_OK : CCCR_Warning;
   }
diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp
index b31cdf3a90..564a754a81 100644
--- a/lib/CodeGen/CGCall.cpp
+++ b/lib/CodeGen/CGCall.cpp
@@ -50,7 +50,7 @@ static unsigned ClangCallConvToLLVMCallConv(CallingConv CC) {
   // TODO: Add support for __pascal to LLVM.
   case CC_X86Pascal: return llvm::CallingConv::C;
   // TODO: Add support for __vectorcall to LLVM.
-  case CC_X86VectorCall: return llvm::CallingConv::C;
+  case CC_X86VectorCall: return llvm::CallingConv::X86_VectorCall;
   }
 }
@@ -603,6 +603,9 @@ getTypeExpansion(QualType Ty, const ASTContext &Context) {
     CharUnits UnionSize = CharUnits::Zero();
 
     for (const auto *FD : RD->fields()) {
+      // Skip zero length bitfields.
+      if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+        continue;
       assert(!FD->isBitField() &&
              "Cannot expand structure with bit-field members.");
       CharUnits FieldSize = Context.getTypeSizeInChars(FD->getType());
@@ -622,6 +625,9 @@ getTypeExpansion(QualType Ty, const ASTContext &Context) {
     }
 
     for (const auto *FD : RD->fields()) {
+      // Skip zero length bitfields.
+      if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+        continue;
       assert(!FD->isBitField() &&
              "Cannot expand structure with bit-field members.");
       Fields.push_back(FD);
diff --git a/lib/CodeGen/MicrosoftCXXABI.cpp b/lib/CodeGen/MicrosoftCXXABI.cpp
index 49907c965d..428c7b11a4 100644
--- a/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -617,8 +617,15 @@ MicrosoftCXXABI::getRecordArgABI(const CXXRecordDecl *RD) const {
   if (RD->hasNonTrivialCopyConstructor())
     return RAA_Indirect;
 
-  // Win64 passes objects larger than 8 bytes indirectly.
-  if (getContext().getTypeSize(RD->getTypeForDecl()) > 64)
+  // If an object has a destructor, we'd really like to pass it indirectly
+  // because it allows us to elide copies. Unfortunately, MSVC makes that
+  // impossible for small types, which it will pass in a single register or
+  // stack slot. Most objects with dtors are large-ish, so handle that early.
+  // We can't call out all large objects as being indirect because there are
+  // multiple x64 calling conventions and the C++ ABI code shouldn't dictate
+  // how we pass large POD types.
+  if (RD->hasNonTrivialDestructor() &&
+      getContext().getTypeSize(RD->getTypeForDecl()) > 64)
     return RAA_Indirect;
 
   // We have a trivial copy constructor or no copy constructors, but we have
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index ed9e83fda3..c776db6104 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -508,18 +508,39 @@ static llvm::Type* X86AdjustInlineAsmType(CodeGen::CodeGenFunction &CGF,
   return Ty;
 }
 
+/// Returns true if this type can be passed in SSE registers with the
+/// X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorTypeForVectorCall(ASTContext &Context, QualType Ty) {
+  if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+    if (BT->isFloatingPoint() && BT->getKind() != BuiltinType::Half)
+      return true;
+  } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // vectorcall can pass XMM, YMM, and ZMM vectors. We don't pass SSE1 MMX
+    // registers specially.
+    unsigned VecSize = Context.getTypeSize(VT);
+    if (VecSize == 128 || VecSize == 256 || VecSize == 512)
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if this aggregate is small enough to be passed in SSE
+/// registers in the X86_VectorCall calling convention. Shared between
+/// x86_32 and x86_64.
+static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) {
+  return NumMembers <= 4;
+}
+
 //===----------------------------------------------------------------------===//
 // X86-32 ABI Implementation
 //===----------------------------------------------------------------------===//
 
 /// \brief Similar to llvm::CCState, but for Clang.
 struct CCState {
-  CCState(unsigned CC) : CC(CC), FreeRegs(0) {}
+  CCState(unsigned CC) : CC(CC), FreeRegs(0), FreeSSERegs(0) {}
 
   unsigned CC;
   unsigned FreeRegs;
-  unsigned StackOffset;
-  bool UseInAlloca;
+  unsigned FreeSSERegs;
 };
 
 /// X86_32ABIInfo - The X86-32 ABI information.
@@ -540,6 +561,17 @@ class X86_32ABIInfo : public ABIInfo {
     return (Size == 8 || Size == 16 || Size == 32 || Size == 64);
   }
 
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
+
   bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const;
 
   /// getIndirectResult - Give a source type \arg Ty, return a suitable result
@@ -767,6 +799,14 @@ ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy, CCState &State) con
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(RetTy, Base, NumElts)) {
+    // The LLVM struct type for such an aggregate should lower properly.
+    return ABIArgInfo::getDirect();
+  }
+
   if (const VectorType *VT = RetTy->getAs<VectorType>()) {
     // On Darwin, some vectors are returned in registers.
     if (IsDarwinVectorABI) {
@@ -939,7 +979,8 @@ bool X86_32ABIInfo::shouldUseInReg(QualType Ty, CCState &State,
 
   State.FreeRegs -= SizeInRegs;
 
-  if (State.CC == llvm::CallingConv::X86_FastCall) {
+  if (State.CC == llvm::CallingConv::X86_FastCall ||
+      State.CC == llvm::CallingConv::X86_VectorCall) {
     if (Size > 32)
       return false;
 
@@ -964,17 +1005,36 @@
 ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
                                                CCState &State) const {
   // FIXME: Set alignment on indirect arguments.
-  if (isAggregateTypeForABI(Ty)) {
-    if (const RecordType *RT = Ty->getAs<RecordType>()) {
-      // Check with the C++ ABI first.
-      CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
-      if (RAA == CGCXXABI::RAA_Indirect) {
-        return getIndirectResult(Ty, false, State);
-      } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
-        // The field index doesn't matter, we'll fix it up later.
-        return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
-      }
+
+  // Check with the C++ ABI first.
+  const RecordType *RT = Ty->getAs<RecordType>();
+  if (RT) {
+    CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+    if (RAA == CGCXXABI::RAA_Indirect) {
+      return getIndirectResult(Ty, false, State);
+    } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+      // The field index doesn't matter, we'll fix it up later.
+      return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
+    }
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar
+  // to other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (State.FreeSSERegs >= NumElts) {
+      State.FreeSSERegs -= NumElts;
+      if (Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return getIndirectResult(Ty, /*ByVal=*/false, State);
+  }
+
+  if (isAggregateTypeForABI(Ty)) {
+    if (RT) {
       // Structs are always byval on win32, regardless of what they contain.
       if (IsWin32StructABI)
         return getIndirectResult(Ty, true, State);
@@ -1006,7 +1066,9 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
     if (getContext().getTypeSize(Ty) <= 4*32 &&
         canExpandIndirectArgument(Ty, getContext()))
       return ABIArgInfo::getExpandWithPadding(
-          State.CC == llvm::CallingConv::X86_FastCall, PaddingType);
+          State.CC == llvm::CallingConv::X86_FastCall ||
+              State.CC == llvm::CallingConv::X86_VectorCall,
+          PaddingType);
 
     return getIndirectResult(Ty, true, State);
   }
@@ -1049,7 +1111,10 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const {
   CCState State(FI.getCallingConvention());
   if (State.CC == llvm::CallingConv::X86_FastCall)
     State.FreeRegs = 2;
-  else if (FI.getHasRegParm())
+  else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+    State.FreeRegs = 2;
+    State.FreeSSERegs = 6;
+  } else if (FI.getHasRegParm())
    State.FreeRegs = FI.getRegParm();
   else
     State.FreeRegs = DefaultNumRegisterParameters;
@@ -1434,7 +1499,8 @@ public:
 
 /// WinX86_64ABIInfo - The Windows X86_64 ABI information.
 class WinX86_64ABIInfo : public ABIInfo {
-  ABIArgInfo classify(QualType Ty, bool IsReturnType) const;
+  ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs,
+                      bool IsReturnType) const;
 
 public:
   WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {}
@@ -1443,6 +1509,17 @@ public:
 
   llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
                          CodeGenFunction &CGF) const override;
+
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
 };
 
 class X86_64TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -2844,7 +2921,8 @@ llvm::Value *X86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
   return ResAddr;
 }
 
-ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
+ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
+                                      bool IsReturnType) const {
 
   if (Ty->isVoidType())
     return ABIArgInfo::getIgnore();
@@ -2852,7 +2930,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
     Ty = EnumTy->getDecl()->getIntegerType();
 
-  uint64_t Size = getContext().getTypeSize(Ty);
+  TypeInfo Info = getContext().getTypeInfo(Ty);
+  uint64_t Width = Info.Width;
+  unsigned Align = getContext().toCharUnitsFromBits(Info.Align).getQuantity();
 
   const RecordType *RT = Ty->getAs<RecordType>();
   if (RT) {
@@ -2865,11 +2945,26 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // FIXME: mingw-w64-gcc emits 128-bit struct as i128
-    if (Size == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
+    if (Width == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
-                                                          Size));
+                                                          Width));
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar to
+  // other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (FreeSSERegs >= NumElts) {
+      FreeSSERegs -= NumElts;
+      if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
   }
+
   if (Ty->isMemberPointerType()) {
     // If the member pointer is represented by an LLVM int or ptr, pass it
     // directly.
@@ -2881,11 +2976,11 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
   if (RT || Ty->isMemberPointerType()) {
     // MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is
     // not 1, 2, 4, or 8 bytes, must be passed by reference."
-    if (Size > 64 || !llvm::isPowerOf2_64(Size))
+    if (Width > 64 || !llvm::isPowerOf2_64(Width))
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // Otherwise, coerce it to a small integer.
-    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width));
   }
 
   // Bool type is always extended to the ABI, other builtin types are not
@@ -2898,11 +2993,18 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
 }
 
 void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  bool IsVectorCall =
+      FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall;
+
+  // We can use up to 4 SSE return registers with vectorcall.
+  unsigned FreeSSERegs = IsVectorCall ? 4 : 0;
   if (!getCXXABI().classifyReturnType(FI))
-    FI.getReturnInfo() = classify(FI.getReturnType(), true);
+    FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true);
 
+  // We can use up to 6 SSE register parameters with vectorcall.
+  FreeSSERegs = IsVectorCall ? 6 : 0;
   for (auto &I : FI.arguments())
-    I.info = classify(I.type, false);
+    I.info = classify(I.type, FreeSSERegs, false);
 }
 
 llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
diff --git a/test/CodeGen/mangle-windows.c b/test/CodeGen/mangle-windows.c
index 37d1018283..4a108751aa 100644
--- a/test/CodeGen/mangle-windows.c
+++ b/test/CodeGen/mangle-windows.c
@@ -1,33 +1,68 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64
 
 void __stdcall f1(void) {}
 // CHECK: define x86_stdcallcc void @"\01_f1@0"
+// X64: define void @f1(
 
 void __fastcall f2(void) {}
 // CHECK: define x86_fastcallcc void @"\01@f2@0"
+// X64: define void @f2(
 
 void __stdcall f3() {}
 // CHECK: define x86_stdcallcc void @"\01_f3@0"
+// X64: define void @f3(
 
 void __fastcall f4(char a) {}
 // CHECK: define x86_fastcallcc void @"\01@f4@4"
+// X64: define void @f4(
 
 void __fastcall f5(short a) {}
 // CHECK: define x86_fastcallcc void @"\01@f5@4"
+// X64: define void @f5(
 
 void __fastcall f6(int a) {}
 // CHECK: define x86_fastcallcc void @"\01@f6@4"
+// X64: define void @f6(
 
 void __fastcall f7(long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f7@4"
+// X64: define void @f7(
 
 void __fastcall f8(long long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f8@8"
+// X64: define void @f8(
 
 void __fastcall f9(long long a, char b, char c, short d) {}
-// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8
-// signext %c, i16 signext %d)
+// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
+// X64: define void @f9(
 
 void f12(void) {}
 // CHECK: define void @f12(
+// X64: define void @f12(
+
+void __vectorcall v1(void) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@0"(
+// X64: define x86_vectorcallcc void @"\01v1@@0"(
+
+void __vectorcall v2(char a) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@4"(
+// X64: define x86_vectorcallcc void @"\01v2@@8"(
+
+void __vectorcall v3(short a) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@4"(
+// X64: define x86_vectorcallcc void @"\01v3@@8"(
+
+void __vectorcall v4(int a) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@4"(
+// X64: define x86_vectorcallcc void @"\01v4@@8"(
+
+void __vectorcall v5(long long a) {}
+// CHECK: define x86_vectorcallcc void @"\01v5@@8"(
+// X64: define x86_vectorcallcc void @"\01v5@@8"(
+
+void __vectorcall v6(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v6@@8"(
+// X64: define x86_vectorcallcc void @"\01v6@@16"(
diff --git a/test/CodeGen/microsoft-call-conv.c b/test/CodeGen/microsoft-call-conv.c
index aabe825516..c24db1296f 100644
--- a/test/CodeGen/microsoft-call-conv.c
+++ b/test/CodeGen/microsoft-call-conv.c
@@ -20,9 +20,8 @@ void __thiscall f6(void) {
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
-// FIXME: Add this to LLVM.
 void __vectorcall f61(void) {
-// CHECK-LABEL: define void @f61()
+// CHECK-LABEL: define x86_vectorcallcc void @f61()
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
@@ -41,7 +40,7 @@ int main(void) {
   // CHECK: call x86_fastcallcc void @f4()
   // CHECK: call x86_stdcallcc void @f5()
   // CHECK: call x86_thiscallcc void @f6()
-  // CHECK: call void @f61()
+  // CHECK: call x86_vectorcallcc void @f61()
   pf1(); pf2(); pf3(); pf4(); pf5(); pf6(); pf7();
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
@@ -49,7 +48,7 @@ int main(void) {
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
   // CHECK: call x86_thiscallcc void %{{.*}}()
-  // CHECK: call void %{{.*}}()
+  // CHECK: call x86_vectorcallcc void %{{.*}}()
   return 0;
 }
diff --git a/test/CodeGen/vectorcall.c b/test/CodeGen/vectorcall.c
new file mode 100644
index 0000000000..17927c7a3d
--- /dev/null
+++ b/test/CodeGen/vectorcall.c
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+
+void __vectorcall v1(int a, int b) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
+// X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
+
+void __vectorcall v2(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
+// X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
+
+struct Small { int a; };
+void __vectorcall v3(int a, struct Small b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
+
+struct Large { int a[5]; };
+void __vectorcall v4(int a, struct Large b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
+
+struct HFA2 { double x, y; };
+struct HFA4 { double w, x, y, z; };
+struct HFA5 { double v, w, x, y, z; };
+
+void __vectorcall hfa1(int a, struct HFA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
+
+// HFAs that would require more than six total SSE registers are passed
+// indirectly. Additional vector arguments can consume the rest of the SSE
+// registers.
+void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* inreg %b, double %c)
+// X64: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* align 8 %b, double %c)
+
+// Ensure that we pass builtin types directly while counting them against the
+// SSE register usage.
+void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* align 8 %f)
+
+// Aggregates with more than four elements are not HFAs and are passed byval.
+// Because they are not classified as homogeneous, they don't get special
+// handling to ensure alignment.
+void __vectorcall hfa4(struct HFA5 a) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
+// X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
+
+// Return HFAs of 4 or fewer elements in registers.
+static struct HFA2 g_hfa2;
+struct HFA2 __vectorcall hfa5(void) { return g_hfa2; }
+// CHECK: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+
+typedef float __attribute__((vector_size(16))) v4f32;
+struct HVA2 { v4f32 x, y; };
+struct HVA4 { v4f32 w, x, y, z; };
+
+void __vectorcall hva1(int a, struct HVA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva1@@72"(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hva1@@80"(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
+
+void __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* inreg %b, <4 x float> %c)
+// X64: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* align 16 %b, <4 x float> %c)
+
+void __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* align 16 %f)
+
+typedef float __attribute__((ext_vector_type(3))) v3f32;
+struct OddSizeHVA { v3f32 x, y; };
+
+void __vectorcall odd_size_hva(struct OddSizeHVA a) {}
+// CHECK: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
+// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
diff --git a/test/CodeGenCXX/homogeneous-aggregates.cpp b/test/CodeGenCXX/homogeneous-aggregates.cpp
index 638184087d..5a34c87cad 100644
--- a/test/CodeGenCXX/homogeneous-aggregates.cpp
+++ b/test/CodeGenCXX/homogeneous-aggregates.cpp
@@ -1,6 +1,13 @@
-// RUNxX: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
 // RUN: %clang_cc1 -mfloat-abi hard -triple armv7-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM32
 // RUN: %clang_cc1 -mfloat-abi hard -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64
+// RUN: %clang_cc1 -mfloat-abi hard -triple x86_64-unknown-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X64
+
+#if defined(__x86_64__)
+#define CC __attribute__((vectorcall))
+#else
+#define CC
+#endif
 
 // Test that C++ classes are correctly classified as homogeneous aggregates.
 
@@ -34,24 +41,26 @@ struct D5 : I1, I2, I3 {}; // homogeneous aggregate
 // PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, { [3 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
-D1 func_D1(D1 x) { return x; }
+// X64: define x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
+D1 CC func_D1(D1 x) { return x; }
 
 // PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D2 @_Z7func_D22D2(%struct.D2 %x.coerce)
 // ARM64: define %struct.D2 @_Z7func_D22D2(double %x.0, double %x.1, double %x.2)
-D2 func_D2(D2 x) { return x; }
+// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(double %x.0, double %x.1, double %x.2)
+D2 CC func_D2(D2 x) { return x; }
 
 // PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, { [4 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, %struct.D3* %x)
-D3 func_D3(D3 x) { return x; }
+D3 CC func_D3(D3 x) { return x; }
 
 // PPC: define [4 x double] @_Z7func_D42D4([4 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D4 @_Z7func_D42D4(%struct.D4 %x.coerce)
 // ARM64: define %struct.D4 @_Z7func_D42D4(double %x.0, double %x.1, double %x.2, double %x.3)
-D4 func_D4(D4 x) { return x; }
+D4 CC func_D4(D4 x) { return x; }
 
-D5 func_D5(D5 x) { return x; }
+D5 CC func_D5(D5 x) { return x; }
 // PPC: define [3 x double] @_Z7func_D52D5([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D5 @_Z7func_D52D5(%struct.D5 %x.coerce)
@@ -92,3 +101,27 @@ void call_D5(D5 *p) {
 // ARM64: getelementptr inbounds %struct.Base2* %{{.*}}, i32 0, i32 0
 // ARM64: load double*
 // ARM64: call %struct.D5 @_Z7func_D52D5(double %{{.*}}, double %{{.*}}, double %{{.*}})
+
+struct Empty { };
+struct Float1 { float x; };
+struct Float2 { float y; };
+struct HVAWithEmptyBase : Float1, Empty, Float2 { float z; };
+
+// PPC: define void @_Z15with_empty_base16HVAWithEmptyBase([3 x float] %a.coerce)
+// ARM64: define void @_Z15with_empty_base16HVAWithEmptyBase(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z15with_empty_base16HVAWithEmptyBase(%struct.HVAWithEmptyBase %a.coerce)
+void CC with_empty_base(HVAWithEmptyBase a) {}
+
+// FIXME: MSVC doesn't consider this an HVA because of the empty base.
+// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(float %a.0, float %a.1, float %a.2)
+
+struct HVAWithEmptyBitField : Float1, Float2 {
+  int : 0; // Takes no space.
+  float z;
+};
+
+// PPC: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce)
+// ARM64: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z19with_empty_bitfield20HVAWithEmptyBitField(%struct.HVAWithEmptyBitField %a.coerce)
+// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(float %a.0, float %a.1, float %a.2)
+void CC with_empty_bitfield(HVAWithEmptyBitField a) {}
-- 
2.40.0
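
A note for readers trying out the new convention: the standalone sketch below
(illustrative only, not part of the patch; the file name, struct and function
names, and the exact clang invocation are assumptions) shows how the
classification rules above behave on declarations you can compile yourself,
e.g. with: clang -target x86_64-pc-win32 -S -emit-llvm vectorcall-demo.c

/* vectorcall-demo.c - a minimal sketch, assuming a Windows x86_64 target. */

typedef float __attribute__((vector_size(16))) v4f32;

/* Four members of one vector type form a homogeneous vector aggregate
   (HVA); isX86VectorCallAggregateSmallEnough() caps HVAs at 4 members. */
struct HVA4 { v4f32 w, x, y, z; };

/* Mixed base types, so not homogeneous; classified as an ordinary struct. */
struct Mixed { v4f32 v; double d; };

/* 'a' expands into four direct <4 x float> arguments, consuming four of
   the six SSE registers the patch budgets for vectorcall parameters. */
void __vectorcall use_hva(struct HVA4 a);

/* 'a' consumes four SSE registers; only two remain, so 'b' does not fit
   and is passed indirectly by address, as in the hfa2/hva2 tests above. */
void __vectorcall spill_hva(struct HVA4 a, struct HVA4 b);

/* C mangling from Mangle.cpp: a '\01' prefix, the name, "@@", then the
   parameter bytes rounded up to pointer width. On x86_64 two ints round
   up to 8 bytes each, so this mangles as "\01two_ints@@16"; on i386 the
   same prototype would mangle as "\01two_ints@@8". */
void __vectorcall two_ints(int a, int b);

The design mirrors fastcall on i386, where vectorcall also reserves two
integer registers, while adding the SSE register accounting that the ARM and
PPC64 homogeneous aggregate code already models.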