void MangleContext::anchor() { }
-enum StdOrFastCC {
- SOF_OTHER,
- SOF_FAST,
- SOF_STD
+enum CCMangling {
+ CCM_Other,
+ CCM_Fast,
+ CCM_Vector,
+ CCM_Std
};
static bool isExternC(const NamedDecl *ND) {
  if (const VarDecl *VD = dyn_cast<VarDecl>(ND))
    return VD->isExternC();
  return cast<FunctionDecl>(ND)->isExternC();
}
-static StdOrFastCC getStdOrFastCallMangling(const ASTContext &Context,
- const NamedDecl *ND) {
+static CCMangling getCallingConvMangling(const ASTContext &Context,
+ const NamedDecl *ND) {
const TargetInfo &TI = Context.getTargetInfo();
const llvm::Triple &Triple = TI.getTriple();
- if (!Triple.isOSWindows() || Triple.getArch() != llvm::Triple::x86)
- return SOF_OTHER;
+ if (!Triple.isOSWindows() ||
+ !(Triple.getArch() == llvm::Triple::x86 ||
+ Triple.getArch() == llvm::Triple::x86_64))
+ return CCM_Other;
if (Context.getLangOpts().CPlusPlus && !isExternC(ND) &&
TI.getCXXABI() == TargetCXXABI::Microsoft)
- return SOF_OTHER;
+ return CCM_Other;
const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND);
if (!FD)
- return SOF_OTHER;
+ return CCM_Other;
QualType T = FD->getType();
const FunctionType *FT = T->castAs<FunctionType>();
CallingConv CC = FT->getCallConv();
switch (CC) {
default:
- return SOF_OTHER;
+ return CCM_Other;
case CC_X86FastCall:
- return SOF_FAST;
+ return CCM_Fast;
case CC_X86StdCall:
- return SOF_STD;
+ return CCM_Std;
+ case CC_X86VectorCall:
+ return CCM_Vector;
}
}
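// For illustration: on a Windows x86 or x86-64 target, a __fastcall function
// maps to CCM_Fast, __stdcall to CCM_Std, and __vectorcall to CCM_Vector;
// everything else (including any function on a non-Windows, non-x86 target)
// is CCM_Other and receives no calling-convention name decoration.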
bool MangleContext::shouldMangleDeclName(const NamedDecl *D) {
const ASTContext &ASTContext = getASTContext();
- StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
- if (CC != SOF_OTHER)
+ CCMangling CC = getCallingConvMangling(ASTContext, D);
+ if (CC != CCM_Other)
return true;
  // In C, functions with no attributes never need to be mangled. Fastpath them.
  if (!getASTContext().getLangOpts().CPlusPlus && !D->hasAttrs())
    return false;
  return shouldMangleCXXName(D);
}
void MangleContext::mangleName(const NamedDecl *D, raw_ostream &Out) {
  const ASTContext &ASTContext = getASTContext();
- StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
+ CCMangling CC = getCallingConvMangling(ASTContext, D);
bool MCXX = shouldMangleCXXName(D);
const TargetInfo &TI = Context.getTargetInfo();
- if (CC == SOF_OTHER || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
+ if (CC == CCM_Other || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D))
mangleObjCMethodName(OMD, Out);
    else
      mangleCXXName(D, Out);
    return;
  }
Out << '\01';
- if (CC == SOF_STD)
+ if (CC == CCM_Std)
Out << '_';
- else
+ else if (CC == CCM_Fast)
Out << '@';
  if (!MCXX)
    Out << D->getIdentifier()->getName();
  else
    mangleCXXName(D, Out);
const FunctionDecl *FD = cast<FunctionDecl>(D);
const FunctionType *FT = FD->getType()->castAs<FunctionType>();
const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);
+ if (CC == CCM_Vector)
+ Out << '@';
Out << '@';
  if (!Proto) {
    Out << '0';
    return;
  }
  unsigned ArgWords = 0;
  if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
    if (!MD->isStatic())
      ++ArgWords;
for (const auto &AT : Proto->param_types())
- // Size should be aligned to DWORD boundary
- ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT), 32) / 32;
- Out << 4 * ArgWords;
+ // Size should be aligned to pointer size.
+ ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT),
+ TI.getPointerWidth(0)) /
+ TI.getPointerWidth(0);
+ Out << ((TI.getPointerWidth(0) / 8) * ArgWords);
}
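// For illustration (matching the tests below): 'void __vectorcall v4(int)'
// now mangles to "\01v4@@4" on x86-32 and to "\01v4@@8" on x86-64, since the
// trailing byte count rounds each argument up to the target pointer width
// rather than to a hard-coded DWORD.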
void MangleContext::mangleGlobalBlock(const BlockDecl *BD,
CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
return (CC == CC_C ||
+ CC == CC_X86VectorCall ||
CC == CC_IntelOclBicc ||
CC == CC_X86_64Win64) ? CCCR_OK : CCCR_Warning;
}
}
CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
return (CC == CC_C ||
+ CC == CC_X86VectorCall ||
CC == CC_IntelOclBicc ||
CC == CC_X86_64SysV) ? CCCR_OK : CCCR_Warning;
}
// TODO: Add support for __pascal to LLVM.
case CC_X86Pascal: return llvm::CallingConv::C;
- // TODO: Add support for __vectorcall to LLVM.
- case CC_X86VectorCall: return llvm::CallingConv::C;
+ case CC_X86VectorCall: return llvm::CallingConv::X86_VectorCall;
}
}
CharUnits UnionSize = CharUnits::Zero();
for (const auto *FD : RD->fields()) {
+ // Skip zero length bitfields.
+ if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+ continue;
assert(!FD->isBitField() &&
"Cannot expand structure with bit-field members.");
CharUnits FieldSize = Context.getTypeSizeInChars(FD->getType());
}
for (const auto *FD : RD->fields()) {
+ // Skip zero length bitfields.
+ if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+ continue;
assert(!FD->isBitField() &&
"Cannot expand structure with bit-field members.");
Fields.push_back(FD);
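// A zero-width bit-field contributes no storage, only alignment, so skipping
// it lets an otherwise-expandable aggregate such as the (illustrative)
// 'struct { float x, y; int : 0; float z; }' pass through the bit-field
// assert; the HVAWithEmptyBitField test at the end of this patch exercises
// the same shape.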
if (RD->hasNonTrivialCopyConstructor())
return RAA_Indirect;
- // Win64 passes objects larger than 8 bytes indirectly.
- if (getContext().getTypeSize(RD->getTypeForDecl()) > 64)
+ // If an object has a destructor, we'd really like to pass it indirectly
+ // because it allows us to elide copies. Unfortunately, MSVC makes that
+ // impossible for small types, which it will pass in a single register or
+ // stack slot. Most objects with dtors are large-ish, so handle that early.
+ // We can't call out all large objects as being indirect because there are
+ // multiple x64 calling conventions and the C++ ABI code shouldn't dictate
+ // how we pass large POD types.
+ if (RD->hasNonTrivialDestructor() &&
+ getContext().getTypeSize(RD->getTypeForDecl()) > 64)
return RAA_Indirect;
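// As a rough illustration (hypothetical types): 'struct S { ~S(); char buf[16]; }'
// has a non-trivial destructor and is wider than 64 bits, so it takes the
// RAA_Indirect path here, while 'struct T { ~T(); int x; }' fits MSVC's
// small-type register/stack-slot rule and falls through to the checks below.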
// We have a trivial copy constructor or no copy constructors, but we have
return Ty;
}
+/// Returns true if this type can be passed in SSE registers with the
+/// X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorTypeForVectorCall(ASTContext &Context, QualType Ty) {
+ if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+ if (BT->isFloatingPoint() && BT->getKind() != BuiltinType::Half)
+ return true;
+ } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
+ // vectorcall can pass XMM, YMM, and ZMM vectors. We don't pass SSE1 MMX
+ // registers specially.
+ unsigned VecSize = Context.getTypeSize(VT);
+ if (VecSize == 128 || VecSize == 256 || VecSize == 512)
+ return true;
+ }
+ return false;
+}
+
+/// Returns true if this aggregate is small enough to be passed in SSE registers
+/// in the X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) {
+ return NumMembers <= 4;
+}
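// Taken together, these two predicates capture the vectorcall notion of a
// homogeneous vector/float aggregate: e.g. 'struct HFA4 { double w, x, y, z; }'
// (four identical float members, as in the tests below) qualifies, a
// five-member variant does not, and neither does anything built from 'half'
// or a 64-bit MMX vector.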
+
//===----------------------------------------------------------------------===//
// X86-32 ABI Implementation
//===----------------------------------------------------------------------===//
/// \brief Similar to llvm::CCState, but for Clang.
struct CCState {
- CCState(unsigned CC) : CC(CC), FreeRegs(0) {}
+ CCState(unsigned CC) : CC(CC), FreeRegs(0), FreeSSERegs(0) {}
unsigned CC;
unsigned FreeRegs;
- unsigned StackOffset;
- bool UseInAlloca;
+ unsigned FreeSSERegs;
};
/// X86_32ABIInfo - The X86-32 ABI information.
return (Size == 8 || Size == 16 || Size == 32 || Size == 64);
}
+ bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+ // FIXME: Assumes vectorcall is in use.
+ return isX86VectorTypeForVectorCall(getContext(), Ty);
+ }
+
+ bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+ uint64_t NumMembers) const override {
+ // FIXME: Assumes vectorcall is in use.
+ return isX86VectorCallAggregateSmallEnough(NumMembers);
+ }
+
bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const;
/// getIndirectResult - Give a source type \arg Ty, return a suitable result
if (RetTy->isVoidType())
return ABIArgInfo::getIgnore();
+ const Type *Base = nullptr;
+ uint64_t NumElts = 0;
+ if (State.CC == llvm::CallingConv::X86_VectorCall &&
+ isHomogeneousAggregate(RetTy, Base, NumElts)) {
+ // The LLVM struct type for such an aggregate should lower properly.
+ return ABIArgInfo::getDirect();
+ }
+
if (const VectorType *VT = RetTy->getAs<VectorType>()) {
// On Darwin, some vectors are returned in registers.
if (IsDarwinVectorABI) {
State.FreeRegs -= SizeInRegs;
- if (State.CC == llvm::CallingConv::X86_FastCall) {
+ if (State.CC == llvm::CallingConv::X86_FastCall ||
+ State.CC == llvm::CallingConv::X86_VectorCall) {
if (Size > 32)
return false;
ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
CCState &State) const {
// FIXME: Set alignment on indirect arguments.
- if (isAggregateTypeForABI(Ty)) {
- if (const RecordType *RT = Ty->getAs<RecordType>()) {
- // Check with the C++ ABI first.
- CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
- if (RAA == CGCXXABI::RAA_Indirect) {
- return getIndirectResult(Ty, false, State);
- } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
- // The field index doesn't matter, we'll fix it up later.
- return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
- }
+ // Check with the C++ ABI first.
+ const RecordType *RT = Ty->getAs<RecordType>();
+ if (RT) {
+ CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+ if (RAA == CGCXXABI::RAA_Indirect) {
+ return getIndirectResult(Ty, false, State);
+ } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+ // The field index doesn't matter, we'll fix it up later.
+ return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
+ }
+ }
+
+ // vectorcall adds the concept of a homogeneous vector aggregate, similar
+ // to other targets.
+ const Type *Base = nullptr;
+ uint64_t NumElts = 0;
+ if (State.CC == llvm::CallingConv::X86_VectorCall &&
+ isHomogeneousAggregate(Ty, Base, NumElts)) {
+ if (State.FreeSSERegs >= NumElts) {
+ State.FreeSSERegs -= NumElts;
+ if (Ty->isBuiltinType() || Ty->isVectorType())
+ return ABIArgInfo::getDirect();
+ return ABIArgInfo::getExpand();
+ }
+ return getIndirectResult(Ty, /*ByVal=*/false, State);
+ }
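// Note the accounting above: each HVA member consumes one of vectorcall's six
// SSE argument registers, and once the pool is exhausted the whole aggregate
// is passed indirectly by address rather than byval (the hfa2/hva2 tests in
// this patch exercise that spill).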
+
+ if (isAggregateTypeForABI(Ty)) {
+ if (RT) {
// Structs are always byval on win32, regardless of what they contain.
if (IsWin32StructABI)
return getIndirectResult(Ty, true, State);
if (getContext().getTypeSize(Ty) <= 4*32 &&
canExpandIndirectArgument(Ty, getContext()))
return ABIArgInfo::getExpandWithPadding(
- State.CC == llvm::CallingConv::X86_FastCall, PaddingType);
+ State.CC == llvm::CallingConv::X86_FastCall ||
+ State.CC == llvm::CallingConv::X86_VectorCall,
+ PaddingType);
return getIndirectResult(Ty, true, State);
}
CCState State(FI.getCallingConvention());
if (State.CC == llvm::CallingConv::X86_FastCall)
State.FreeRegs = 2;
- else if (FI.getHasRegParm())
+ else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+ State.FreeRegs = 2;
+ State.FreeSSERegs = 6;
+ } else if (FI.getHasRegParm())
State.FreeRegs = FI.getRegParm();
else
State.FreeRegs = DefaultNumRegisterParameters;
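// So on x86-32, vectorcall gets the same two integer registers as fastcall
// (ECX and EDX) plus six SSE registers (XMM0-XMM5) for vector and HVA
// arguments.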
/// WinX86_64ABIInfo - The Windows X86_64 ABI information.
class WinX86_64ABIInfo : public ABIInfo {
- ABIArgInfo classify(QualType Ty, bool IsReturnType) const;
+ ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs,
+ bool IsReturnType) const;
public:
WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {}
llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
CodeGenFunction &CGF) const override;
+
+ bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+ // FIXME: Assumes vectorcall is in use.
+ return isX86VectorTypeForVectorCall(getContext(), Ty);
+ }
+
+ bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+ uint64_t NumMembers) const override {
+ // FIXME: Assumes vectorcall is in use.
+ return isX86VectorCallAggregateSmallEnough(NumMembers);
+ }
};
class X86_64TargetCodeGenInfo : public TargetCodeGenInfo {
return ResAddr;
}
-ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
+ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
+ bool IsReturnType) const {
if (Ty->isVoidType())
return ABIArgInfo::getIgnore();
if (const EnumType *EnumTy = Ty->getAs<EnumType>())
Ty = EnumTy->getDecl()->getIntegerType();
- uint64_t Size = getContext().getTypeSize(Ty);
+ TypeInfo Info = getContext().getTypeInfo(Ty);
+ uint64_t Width = Info.Width;
+ unsigned Align = getContext().toCharUnitsFromBits(Info.Align).getQuantity();
const RecordType *RT = Ty->getAs<RecordType>();
if (RT) {
return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
// FIXME: mingw-w64-gcc emits 128-bit struct as i128
- if (Size == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
+ if (Width == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
- Size));
+ Width));
+ }
+
+ // vectorcall adds the concept of a homogeneous vector aggregate, similar to
+ // other targets.
+ const Type *Base = nullptr;
+ uint64_t NumElts = 0;
+ if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) {
+ if (FreeSSERegs >= NumElts) {
+ FreeSSERegs -= NumElts;
+ if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())
+ return ABIArgInfo::getDirect();
+ return ABIArgInfo::getExpand();
+ }
+ return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
}
+
if (Ty->isMemberPointerType()) {
// If the member pointer is represented by an LLVM int or ptr, pass it
// directly.
if (RT || Ty->isMemberPointerType()) {
// MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is
// not 1, 2, 4, or 8 bytes, must be passed by reference."
- if (Size > 64 || !llvm::isPowerOf2_64(Size))
+ if (Width > 64 || !llvm::isPowerOf2_64(Width))
return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
// Otherwise, coerce it to a small integer.
- return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+ return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width));
}
// Bool type is always extended to the ABI, other builtin types are not
}
void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const {
+ bool IsVectorCall =
+ FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall;
+
+ // We can use up to 4 SSE return registers with vectorcall.
+ unsigned FreeSSERegs = IsVectorCall ? 4 : 0;
if (!getCXXABI().classifyReturnType(FI))
- FI.getReturnInfo() = classify(FI.getReturnType(), true);
+ FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true);
+ // We can use up to 6 SSE register parameters with vectorcall.
+ FreeSSERegs = IsVectorCall ? 6 : 0;
for (auto &I : FI.arguments())
- I.info = classify(I.type, false);
+ I.info = classify(I.type, FreeSSERegs, false);
}
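// For illustration: a returned HFA2 fits within the four SSE return registers
// and comes back directly (see the hfa5 test below), while the six-register
// parameter budget is shared across all vector and HVA arguments in
// declaration order, as the hfa2/hfa3 tests demonstrate.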
llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64
void __stdcall f1(void) {}
// CHECK: define x86_stdcallcc void @"\01_f1@0"
+// X64: define void @f1(
void __fastcall f2(void) {}
// CHECK: define x86_fastcallcc void @"\01@f2@0"
+// X64: define void @f2(
void __stdcall f3() {}
// CHECK: define x86_stdcallcc void @"\01_f3@0"
+// X64: define void @f3(
void __fastcall f4(char a) {}
// CHECK: define x86_fastcallcc void @"\01@f4@4"
+// X64: define void @f4(
void __fastcall f5(short a) {}
// CHECK: define x86_fastcallcc void @"\01@f5@4"
+// X64: define void @f5(
void __fastcall f6(int a) {}
// CHECK: define x86_fastcallcc void @"\01@f6@4"
+// X64: define void @f6(
void __fastcall f7(long a) {}
// CHECK: define x86_fastcallcc void @"\01@f7@4"
+// X64: define void @f7(
void __fastcall f8(long long a) {}
// CHECK: define x86_fastcallcc void @"\01@f8@8"
+// X64: define void @f8(
void __fastcall f9(long long a, char b, char c, short d) {}
-// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8
-// signext %c, i16 signext %d)
+// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
+// X64: define void @f9(
void f12(void) {}
// CHECK: define void @f12(
+// X64: define void @f12(
+
+void __vectorcall v1(void) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@0"(
+// X64: define x86_vectorcallcc void @"\01v1@@0"(
+
+void __vectorcall v2(char a) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@4"(
+// X64: define x86_vectorcallcc void @"\01v2@@8"(
+
+void __vectorcall v3(short a) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@4"(
+// X64: define x86_vectorcallcc void @"\01v3@@8"(
+
+void __vectorcall v4(int a) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@4"(
+// X64: define x86_vectorcallcc void @"\01v4@@8"(
+
+void __vectorcall v5(long long a) {}
+// CHECK: define x86_vectorcallcc void @"\01v5@@8"(
+// X64: define x86_vectorcallcc void @"\01v5@@8"(
+
+void __vectorcall v6(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v6@@8"(
+// X64: define x86_vectorcallcc void @"\01v6@@16"(
f3();
// CHECK: call x86_thiscallcc void @f3()
}
-// FIXME: Add this to LLVM.
void __vectorcall f61(void) {
-// CHECK-LABEL: define void @f61()
+// CHECK-LABEL: define x86_vectorcallcc void @f61()
f3();
// CHECK: call x86_thiscallcc void @f3()
}
// CHECK: call x86_fastcallcc void @f4()
// CHECK: call x86_stdcallcc void @f5()
// CHECK: call x86_thiscallcc void @f6()
- // CHECK: call void @f61()
+ // CHECK: call x86_vectorcallcc void @f61()
pf1(); pf2(); pf3(); pf4(); pf5(); pf6(); pf7();
// CHECK: call x86_fastcallcc void %{{.*}}()
// CHECK: call x86_stdcallcc void %{{.*}}()
// CHECK: call x86_fastcallcc void %{{.*}}()
// CHECK: call x86_stdcallcc void %{{.*}}()
// CHECK: call x86_thiscallcc void %{{.*}}()
- // CHECK: call void %{{.*}}()
+ // CHECK: call x86_vectorcallcc void %{{.*}}()
return 0;
}
--- /dev/null
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+
+void __vectorcall v1(int a, int b) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
+// X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
+
+void __vectorcall v2(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
+// X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
+
+struct Small { int a; };
+void __vectorcall v3(int a, struct Small b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
+
+struct Large { int a[5]; };
+void __vectorcall v4(int a, struct Large b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
+
+struct HFA2 { double x, y; };
+struct HFA4 { double w, x, y, z; };
+struct HFA5 { double v, w, x, y, z; };
+
+void __vectorcall hfa1(int a, struct HFA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
+
+// HFAs that would require more than six total SSE registers are passed
+// indirectly. Additional vector arguments can consume the rest of the SSE
+// registers.
+void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* inreg %b, double %c)
+// X64: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* align 8 %b, double %c)
+
+// Ensure that we pass builtin types directly while counting them against the
+// SSE register usage.
+void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* align 8 %f)
+
+// Aggregates with more than four elements are not HFAs and are passed byval.
+// Because they are not classified as homogeneous, they don't get special
+// handling to ensure alignment.
+void __vectorcall hfa4(struct HFA5 a) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
+// X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
+
+// Return HFAs of 4 or fewer elements in registers.
+static struct HFA2 g_hfa2;
+struct HFA2 __vectorcall hfa5(void) { return g_hfa2; }
+// CHECK: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+
+typedef float __attribute__((vector_size(16))) v4f32;
+struct HVA2 { v4f32 x, y; };
+struct HVA4 { v4f32 w, x, y, z; };
+
+void __vectorcall hva1(int a, struct HVA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva1@@72"(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hva1@@80"(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
+
+void __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* inreg %b, <4 x float> %c)
+// X64: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* align 16 %b, <4 x float> %c)
+
+void __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* align 16 %f)
+
+typedef float __attribute__((ext_vector_type(3))) v3f32;
+struct OddSizeHVA { v3f32 x, y; };
+
+void __vectorcall odd_size_hva(struct OddSizeHVA a) {}
+// CHECK: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
+// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
-// RUNxX: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
// RUN: %clang_cc1 -mfloat-abi hard -triple armv7-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM32
// RUN: %clang_cc1 -mfloat-abi hard -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64
+// RUN: %clang_cc1 -mfloat-abi hard -triple x86_64-unknown-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X64
+
+#if defined(__x86_64__)
+#define CC __attribute__((vectorcall))
+#else
+#define CC
+#endif
// Test that C++ classes are correctly classified as homogeneous aggregates.
// PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce)
// ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, { [3 x i64] } %x.coerce)
// ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
-D1 func_D1(D1 x) { return x; }
+// X64: define x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
+D1 CC func_D1(D1 x) { return x; }
// PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce)
// ARM32: define arm_aapcs_vfpcc %struct.D2 @_Z7func_D22D2(%struct.D2 %x.coerce)
// ARM64: define %struct.D2 @_Z7func_D22D2(double %x.0, double %x.1, double %x.2)
-D2 func_D2(D2 x) { return x; }
+// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(double %x.0, double %x.1, double %x.2)
+D2 CC func_D2(D2 x) { return x; }
// PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce)
// ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, { [4 x i64] } %x.coerce)
// ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, %struct.D3* %x)
-D3 func_D3(D3 x) { return x; }
+D3 CC func_D3(D3 x) { return x; }
// PPC: define [4 x double] @_Z7func_D42D4([4 x double] %x.coerce)
// ARM32: define arm_aapcs_vfpcc %struct.D4 @_Z7func_D42D4(%struct.D4 %x.coerce)
// ARM64: define %struct.D4 @_Z7func_D42D4(double %x.0, double %x.1, double %x.2, double %x.3)
-D4 func_D4(D4 x) { return x; }
+D4 CC func_D4(D4 x) { return x; }
// PPC: define [3 x double] @_Z7func_D52D5([3 x double] %x.coerce)
// ARM32: define arm_aapcs_vfpcc %struct.D5 @_Z7func_D52D5(%struct.D5 %x.coerce)
-D5 func_D5(D5 x) { return x; }
+D5 CC func_D5(D5 x) { return x; }
// ARM64: getelementptr inbounds %struct.Base2* %{{.*}}, i32 0, i32 0
// ARM64: load double*
// ARM64: call %struct.D5 @_Z7func_D52D5(double %{{.*}}, double %{{.*}}, double %{{.*}})
+
+struct Empty { };
+struct Float1 { float x; };
+struct Float2 { float y; };
+struct HVAWithEmptyBase : Float1, Empty, Float2 { float z; };
+
+// PPC: define void @_Z15with_empty_base16HVAWithEmptyBase([3 x float] %a.coerce)
+// ARM64: define void @_Z15with_empty_base16HVAWithEmptyBase(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z15with_empty_base16HVAWithEmptyBase(%struct.HVAWithEmptyBase %a.coerce)
+void CC with_empty_base(HVAWithEmptyBase a) {}
+
+// FIXME: MSVC doesn't consider this an HVA because of the empty base.
+// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(float %a.0, float %a.1, float %a.2)
+
+struct HVAWithEmptyBitField : Float1, Float2 {
+ int : 0; // Takes no space.
+ float z;
+};
+
+// PPC: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce)
+// ARM64: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z19with_empty_bitfield20HVAWithEmptyBitField(%struct.HVAWithEmptyBitField %a.coerce)
+// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(float %a.0, float %a.1, float %a.2)
+void CC with_empty_bitfield(HVAWithEmptyBitField a) {}