From dc04fc16d68919f4f8cb1acb50740495126190e5 Mon Sep 17 00:00:00 2001
From: Simon Dardis <simon.dardis@imgtec.com>
Date: Fri, 7 Apr 2017 13:03:52 +0000
Subject: [PATCH] [SelectionDAG] Enable target specific vector scalarization of
 calls and returns

By target hookifying getRegisterType, getNumRegisters, getVectorBreakdown,
backends can request that LLVM scalarize vector types for calls
and returns.

The MIPS vector ABI requires that vector arguments and returns are passed in
integer registers. With SelectionDAG's new hooks, the MIPS backend can now
handle LLVM-IR with vector types in calls and returns. E.g.
'call @foo(<4 x i32> %4)'.

Previously these cases would be scalarized for the MIPS O32/N32/N64 ABI for
calls and returns if vector types were not legal. If vector types were legal,
a single 128bit vector argument would be assigned to a single 32 bit / 64 bit
integer register.

By teaching the MIPS backend to inspect the original types, it can now
implement the MIPS vector ABI which requires a particular method of
scalarizing vectors.

Previously, the MIPS backend relied on clang to scalarize types such as "call
@foo(<4 x float> %a)" into "call @foo(i32 inreg %1, i32 inreg %2, i32 inreg %3,
i32 inreg %4)".

This patch enables the MIPS backend to take either form for vector types.

Reviewers: zoran.jovanovic, jaydeep, vkalintiris, slthakur

Differential Revision: https://reviews.llvm.org/D27845


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@299766 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h          |   37 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  232 ++-
 .../SelectionDAG/SelectionDAGBuilder.h        |   14 +-
 .../SelectionDAG/StatepointLowering.cpp       |    2 +-
 lib/CodeGen/TargetLoweringBase.cpp            |    6 +-
 lib/Target/Mips/MipsCCState.cpp               |   64 +-
 lib/Target/Mips/MipsCCState.h                 |   34 +-
 lib/Target/Mips/MipsCallingConv.td            |   10 +-
 lib/Target/Mips/MipsISelLowering.cpp          |   74 +-
 lib/Target/Mips/MipsISelLowering.h            |   27 +
 lib/Target/Mips/MipsRegisterInfo.cpp          |    4 +-
 test/CodeGen/Mips/cconv/vector.ll             | 1657 +++++++++++++++++
 test/CodeGen/Mips/ctlz-v.ll                   |   12 +-
 test/CodeGen/Mips/cttz-v.ll                   |   19 +-
 test/CodeGen/Mips/return-vector.ll            |   33 +-
 15 files changed, 2121 insertions(+), 104 deletions(-)
 create mode 100644 test/CodeGen/Mips/cconv/vector.ll

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 240896a538f..6a350a2169b 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -662,6 +662,16 @@ public:
                                   unsigned &NumIntermediates,
                                   MVT &RegisterVT) const;
 
+  /// Certain targets such as MIPS require that some types such as vectors are
+  /// always broken down into scalars in some contexts. This occurs even if the
+  /// vector type is legal.
+  virtual unsigned getVectorTypeBreakdownForCallingConv(
+      LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+      unsigned &NumIntermediates, MVT &RegisterVT) const {
+    return getVectorTypeBreakdown(Context, VT, IntermediateVT, NumIntermediates,
+                                  RegisterVT);
+  }
+
   struct IntrinsicInfo {
     unsigned     opc = 0;          // target opcode
     EVT          memVT;            // memory VT
@@ -1002,6 +1012,33 @@ public:
     llvm_unreachable("Unsupported extended type!");
   }
 
+  /// Certain combinations of ABIs, Targets and features require that types
+  /// are legal for some operations and not for other operations.
+  /// For MIPS all vector types must be passed through the integer register set.
+  virtual MVT getRegisterTypeForCallingConv(MVT VT) const {
+    return getRegisterType(VT);
+  }
+
+  virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                            EVT VT) const {
+    return getRegisterType(Context, VT);
+  }
+
+  /// Certain targets require unusual breakdowns of certain types. For MIPS,
+  /// this occurs when a vector type is used, as vectors are passed through the
+  /// integer register set.
+  virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                                 EVT VT) const {
+    return getNumRegisters(Context, VT);
+  }
+
+  /// Certain targets have context sensitive alignment requirements, where one
+  /// type has the alignment requirement of another type.
+  virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy,
+                                                 DataLayout DL) const {
+    return DL.getABITypeAlignment(ArgTy);
+  }
+
   /// If true, then instruction selection should seek to shrink the FP constant
   /// of the specified type to a smaller type in order to save space and / or
   /// reduce runtime.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 315d841cf3c..43887a2e348 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -115,7 +115,8 @@ static const unsigned MaxParallelChains = 64;
 
 static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                       const SDValue *Parts, unsigned NumParts,
-                                      MVT PartVT, EVT ValueVT, const Value *V);
+                                      MVT PartVT, EVT ValueVT, const Value *V,
+                                      bool IsABIRegCopy);
 
 /// getCopyFromParts - Create a value that contains the specified legal parts
 /// combined into the value they represent.  If the parts combine to a type
@@ -125,10 +126,11 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
 static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
                                 const SDValue *Parts, unsigned NumParts,
                                 MVT PartVT, EVT ValueVT, const Value *V,
-                                Optional<ISD::NodeType> AssertOp = None) {
+                                Optional<ISD::NodeType> AssertOp = None,
+                                bool IsABIRegCopy = false) {
   if (ValueVT.isVector())
     return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
-                                  PartVT, ValueVT, V);
+                                  PartVT, ValueVT, V, IsABIRegCopy);
 
   assert(NumParts > 0 && "No parts to assemble!");
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -272,7 +274,8 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
 /// ValueVT (ISD::AssertSext).
 static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                       const SDValue *Parts, unsigned NumParts,
-                                      MVT PartVT, EVT ValueVT, const Value *V) {
+                                      MVT PartVT, EVT ValueVT, const Value *V,
+                                      bool IsABIRegCopy) {
   assert(ValueVT.isVector() && "Not a vector value");
   assert(NumParts > 0 && "No parts to assemble!");
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -283,9 +286,18 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
     EVT IntermediateVT;
     MVT RegisterVT;
     unsigned NumIntermediates;
-    unsigned NumRegs =
-    TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
-                               NumIntermediates, RegisterVT);
+    unsigned NumRegs;
+
+    if (IsABIRegCopy) {
+      NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+          *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
+          RegisterVT);
+    } else {
+      NumRegs =
+          TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+                                     NumIntermediates, RegisterVT);
+    }
+
     assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
     NumParts = NumRegs; // Silence a compiler warning.
     assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
@@ -314,9 +326,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
 
     // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
     // intermediate operands.
+    EVT BuiltVectorTy =
+        EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
+                         (IntermediateVT.isVector()
+                              ? IntermediateVT.getVectorNumElements() * NumParts
+                              : NumIntermediates));
     Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
                                                 : ISD::BUILD_VECTOR,
-                      DL, ValueVT, Ops);
+                      DL, BuiltVectorTy, Ops);
   }
 
   // There is now one part, held in Val.  Correct it to match ValueVT.
@@ -355,13 +372,30 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
       TLI.isTypeLegal(ValueVT))
     return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
 
-  // Handle cases such as i8 -> <1 x i1>
   if (ValueVT.getVectorNumElements() != 1) {
-    diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
-                                      "non-trivial scalar-to-vector conversion");
+
+    // Certain ABIs require that vectors are passed as integers. For vectors
+    // that are the same size, this is an obvious bitcast.
+    if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
+      return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+    } else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) {
+      // Bitcast Val back to the original type and extract the corresponding
+      // vector we want.
+      unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits();
+      EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
+                                          ValueVT.getVectorElementType(), Elts);
+      Val = DAG.getBitcast(WiderVecType, Val);
+      return DAG.getNode(
+          ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+          DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+    }
+
+    diagnosePossiblyInvalidConstraint(
+        *DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
     return DAG.getUNDEF(ValueVT);
   }
 
+  // Handle cases such as i8 -> <1 x i1>
   if (ValueVT.getVectorNumElements() == 1 &&
       ValueVT.getVectorElementType() != PartEVT)
     Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType());
@@ -371,7 +405,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
 
 static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
                                  SDValue Val, SDValue *Parts, unsigned NumParts,
-                                 MVT PartVT, const Value *V);
+                                 MVT PartVT, const Value *V, bool IsABIRegCopy);
 
 /// getCopyToParts - Create a series of nodes that contain the specified value
 /// split into legal parts.  If the parts contain more bits than Val, then, for
@@ -379,12 +413,14 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
 static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
                            SDValue *Parts, unsigned NumParts, MVT PartVT,
                            const Value *V,
-                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND,
+                           bool IsABIRegCopy = false) {
   EVT ValueVT = Val.getValueType();
 
   // Handle the vector case separately.
   if (ValueVT.isVector())
-    return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
+    return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
+                                IsABIRegCopy);
 
   unsigned PartBits = PartVT.getSizeInBits();
   unsigned OrigNumParts = NumParts;
@@ -509,7 +545,9 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
 /// value split into legal parts.
 static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
                                  SDValue Val, SDValue *Parts, unsigned NumParts,
-                                 MVT PartVT, const Value *V) {
+                                 MVT PartVT, const Value *V,
+                                 bool IsABIRegCopy) {
+
   EVT ValueVT = Val.getValueType();
   assert(ValueVT.isVector() && "Not a vector");
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -550,15 +588,22 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
 
       // Promoted vector extract
       Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
-    } else{
+    } else {
       // Vector -> scalar conversion.
-      assert(ValueVT.getVectorNumElements() == 1 &&
-             "Only trivial vector-to-scalar conversions should get here!");
-      Val = DAG.getNode(
-          ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
-          DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+      if (ValueVT.getVectorNumElements() == 1) {
+        Val = DAG.getNode(
+            ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+            DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
 
-      Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+        Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+      } else {
+        assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
+               "lossy conversion of vector to scalar type");
+        EVT IntermediateType = EVT::getIntegerVT(*DAG.getContext(),
+                                                 ValueVT.getSizeInBits());
+        Val = DAG.getBitcast(IntermediateType, Val);
+        Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+      }
     }
 
     Parts[0] = Val;
@@ -569,15 +614,31 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
   EVT IntermediateVT;
   MVT RegisterVT;
   unsigned NumIntermediates;
-  unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT,
-                                                IntermediateVT,
-                                                NumIntermediates, RegisterVT);
+  unsigned NumRegs;
+  if (IsABIRegCopy) {
+    NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
+        *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
+        RegisterVT);
+  } else {
+    NumRegs =
+        TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+                                   NumIntermediates, RegisterVT);
+  }
   unsigned NumElements = ValueVT.getVectorNumElements();
 
   assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
   NumParts = NumRegs; // Silence a compiler warning.
   assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
 
+  // Convert the vector to the appropriate type if necessary.
+  unsigned DestVectorNoElts =
+      NumIntermediates *
+      (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1);
+  EVT BuiltVectorTy = EVT::getVectorVT(
+      *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
+  if (Val.getValueType() != BuiltVectorTy)
+    Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+
   // Split the vector into intermediate operands.
   SmallVector<SDValue, 8> Ops(NumIntermediates);
   for (unsigned i = 0; i != NumIntermediates; ++i) {
@@ -610,22 +671,31 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
   }
 }
 
-RegsForValue::RegsForValue() {}
+RegsForValue::RegsForValue() { IsABIMangled = false; }
 
 RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
-                           EVT valuevt)
-    : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+                           EVT valuevt, bool IsABIMangledValue)
+    : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs),
+      RegCount(1, regs.size()), IsABIMangled(IsABIMangledValue) {}
 
 RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
-                           const DataLayout &DL, unsigned Reg, Type *Ty) {
+                           const DataLayout &DL, unsigned Reg, Type *Ty,
+                           bool IsABIMangledValue) {
   ComputeValueVTs(TLI, DL, Ty, ValueVTs);
 
+  IsABIMangled = IsABIMangledValue;
+
   for (EVT ValueVT : ValueVTs) {
-    unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT);
-    MVT RegisterVT = TLI.getRegisterType(Context, ValueVT);
+    unsigned NumRegs = IsABIMangledValue
+                           ? TLI.getNumRegistersForCallingConv(Context, ValueVT)
+                           : TLI.getNumRegisters(Context, ValueVT);
+    MVT RegisterVT = IsABIMangledValue
+                         ? TLI.getRegisterTypeForCallingConv(Context, ValueVT)
+                         : TLI.getRegisterType(Context, ValueVT);
     for (unsigned i = 0; i != NumRegs; ++i)
       Regs.push_back(Reg + i);
     RegVTs.push_back(RegisterVT);
+    RegCount.push_back(NumRegs);
     Reg += NumRegs;
   }
 }
@@ -646,8 +716,10 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
   for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
     // Copy the legal parts from the registers.
     EVT ValueVT = ValueVTs[Value];
-    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
-    MVT RegisterVT = RegVTs[Value];
+    unsigned NumRegs = RegCount[Value];
+    MVT RegisterVT = IsABIMangled
+                         ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
+                         : RegVTs[Value];
 
     Parts.resize(NumRegs);
     for (unsigned i = 0; i != NumRegs; ++i) {
@@ -742,9 +814,11 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
   unsigned NumRegs = Regs.size();
   SmallVector<SDValue, 8> Parts(NumRegs);
   for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
-    EVT ValueVT = ValueVTs[Value];
-    unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
-    MVT RegisterVT = RegVTs[Value];
+    unsigned NumParts = RegCount[Value];
+
+    MVT RegisterVT = IsABIMangled
+                         ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
+                         : RegVTs[Value];
 
     if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
       ExtendKind = ISD::ZERO_EXTEND;
@@ -967,10 +1041,16 @@ SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
 
   if (It != FuncInfo.ValueMap.end()) {
     unsigned InReg = It->second;
+    bool IsABIRegCopy =
+        V && ((isa<CallInst>(V) &&
+               !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
+              isa<ReturnInst>(V));
+
     RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
-                     DAG.getDataLayout(), InReg, Ty);
+                     DAG.getDataLayout(), InReg, Ty, IsABIRegCopy);
     SDValue Chain = DAG.getEntryNode();
-    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
+    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
+                                 V);
     resolveDanglingDebugInfo(V, Result);
   }
 
@@ -1157,8 +1237,13 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
   // If this is an instruction which fast-isel has deferred, select it now.
   if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
     unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
+    bool IsABIRegCopy =
+        V && ((isa<CallInst>(V) &&
+               !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
+              isa<ReturnInst>(V));
+
     RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
-                     Inst->getType());
+                     Inst->getType(), IsABIRegCopy);
     SDValue Chain = DAG.getEntryNode();
     return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
   }
@@ -1386,12 +1471,12 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
           VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
 
-        unsigned NumParts = TLI.getNumRegisters(Context, VT);
-        MVT PartVT = TLI.getRegisterType(Context, VT);
+        unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, VT);
+        MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, VT);
         SmallVector<SDValue, 4> Parts(NumParts);
         getCopyToParts(DAG, getCurSDLoc(),
                        SDValue(RetOp.getNode(), RetOp.getResNo() + j),
-                       &Parts[0], NumParts, PartVT, &I, ExtendKind);
+                       &Parts[0], NumParts, PartVT, &I, ExtendKind, true);
 
         // 'inreg' on function refers to return value
         ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
@@ -7064,8 +7149,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
           SDLoc dl = getCurSDLoc();
           // Use the produced MatchedRegs object to
-          MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl,
-                                    Chain, &Flag, CS.getInstruction());
+          MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
+                                    CS.getInstruction());
           MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
                                            true, OpInfo.getMatchedOperand(), dl,
                                            DAG, AsmNodeOperands);
@@ -7681,8 +7766,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
   } else {
     for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
       EVT VT = RetTys[I];
-      MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
-      unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+      MVT RegisterVT =
+          getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+      unsigned NumRegs =
+          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
       for (unsigned i = 0; i != NumRegs; ++i) {
         ISD::InputArg MyFlags;
         MyFlags.VT = RegisterVT;
@@ -7731,7 +7818,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       SDValue Op = SDValue(Args[i].Node.getNode(),
                            Args[i].Node.getResNo() + Value);
       ISD::ArgFlagsTy Flags;
-      unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+
+      // Certain targets (such as MIPS), may have a different ABI alignment
+      // for a type depending on the context. Give the target a chance to
+      // specify the alignment it wants.
+      unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);
 
       if (Args[i].IsZExt)
         Flags.setZExt();
@@ -7786,8 +7877,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         Flags.setInConsecutiveRegs();
       Flags.setOrigAlign(OriginalAlignment);
 
-      MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
-      unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT);
+      MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+      unsigned NumParts =
+          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
       SmallVector<SDValue, 4> Parts(NumParts);
       ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
 
@@ -7817,7 +7909,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       }
 
       getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
-                     CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind);
+                     CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind,
+                     true);
 
       for (unsigned j = 0; j != NumParts; ++j) {
         // if it isn't first piece, alignment must be 1
@@ -7917,12 +8010,14 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
     unsigned CurReg = 0;
     for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
       EVT VT = RetTys[I];
-      MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
-      unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+      MVT RegisterVT =
+          getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
+      unsigned NumRegs =
+          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
 
       ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
                                               NumRegs, RegisterVT, VT, nullptr,
-                                              AssertOp));
+                                              AssertOp, true));
       CurReg += NumRegs;
     }
 
@@ -7958,8 +8053,15 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
   assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // If this is an InlineAsm we have to match the registers required, not the
+  // notional registers required by the type.
+  bool IsABIRegCopy =
+    V && ((isa<CallInst>(V) &&
+           !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
+          isa<ReturnInst>(V));
+
   RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
-                   V->getType());
+                   V->getType(), IsABIRegCopy);
   SDValue Chain = DAG.getEntryNode();
 
   ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
@@ -8202,7 +8304,12 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       EVT VT = ValueVTs[Value];
       Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
       ISD::ArgFlagsTy Flags;
-      unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+
+      // Certain targets (such as MIPS), may have a different ABI alignment
+      // for a type depending on the context. Give the target a chance to
+      // specify the alignment it wants.
+      unsigned OriginalAlignment =
+          TLI->getABIAlignmentForCallingConv(ArgTy, DL);
 
       if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
         Flags.setZExt();
@@ -8264,8 +8371,10 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       if (ArgCopyElisionCandidates.count(&Arg))
         Flags.setCopyElisionCandidate();
 
-      MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
-      unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+      MVT RegisterVT =
+          TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
+      unsigned NumRegs =
+          TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);
       for (unsigned i = 0; i != NumRegs; ++i) {
         ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
                               Idx-1, PartBase+i*RegisterVT.getStoreSize());
@@ -8372,8 +8481,10 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
 
     for (unsigned Val = 0; Val != NumValues; ++Val) {
       EVT VT = ValueVTs[Val];
-      MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
-      unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+      MVT PartVT =
+          TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
+      unsigned NumParts =
+          TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);
 
       // Even an apparant 'unused' swifterror argument needs to be returned. So
       // we do generate a copy for it that can be used on return from the
@@ -8386,7 +8497,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           AssertOp = ISD::AssertZext;
 
         ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
-                                             PartVT, VT, nullptr, AssertOp));
+                                             PartVT, VT, nullptr, AssertOp,
+                                             true));
       }
 
       i += NumParts;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index c6acc09b660..b24a513f3c0 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -973,18 +973,28 @@ struct RegsForValue {
   /// expanded value requires multiple registers.
   SmallVector<unsigned, 4> Regs;
 
+  /// This list holds the number of registers for each value.
+  SmallVector<unsigned, 4> RegCount;
+
+  /// Records if this value needs to be treated in an ABI dependent manner,
+  /// different to normal type legalization.
+  bool IsABIMangled;
+
   RegsForValue();
 
-  RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt);
+  RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt,
+               bool IsABIMangledValue = false);
 
   RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
-               const DataLayout &DL, unsigned Reg, Type *Ty);
+               const DataLayout &DL, unsigned Reg, Type *Ty,
+               bool IsABIMangledValue = false);
 
   /// Add the specified values to this one.
   void append(const RegsForValue &RHS) {
     ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
     RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
     Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+    RegCount.push_back(RHS.Regs.size());
   }
 
   /// Emit a series of CopyFromReg nodes that copies from this value and returns
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index d27e2455978..1e2dc3a9f23 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -835,7 +835,7 @@ SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
       //       completely and make statepoint call to return a tuple.
       unsigned Reg = FuncInfo.CreateRegs(RetTy);
       RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
-                       DAG.getDataLayout(), Reg, RetTy);
+                       DAG.getDataLayout(), Reg, RetTy, true);
       SDValue Chain = DAG.getEntryNode();
 
       RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index fc147633966..c9ecd8ae0f9 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -1616,8 +1616,10 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,
         VT = MinVT;
     }
 
-    unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
-    MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
+    unsigned NumParts =
+        TLI.getNumRegistersForCallingConv(ReturnType->getContext(), VT);
+    MVT PartVT =
+        TLI.getRegisterTypeForCallingConv(ReturnType->getContext(), VT);
 
     // 'inreg' on function refers to return value
     ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
diff --git a/lib/Target/Mips/MipsCCState.cpp b/lib/Target/Mips/MipsCCState.cpp
index 7af988c1f64..62ff99c7816 100644
--- a/lib/Target/Mips/MipsCCState.cpp
+++ b/lib/Target/Mips/MipsCCState.cpp
@@ -54,6 +54,22 @@ static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) {
   return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
 }
 
+/// Return true if the original type was vXfXX.
+static bool originalEVTTypeIsVectorFloat(EVT Ty) {
+  if (Ty.isVector() && Ty.getVectorElementType().isFloatingPoint())
+    return true;
+
+  return false;
+}
+
+/// Return true if the original type was vXfXX.
+static bool originalTypeIsVectorFloat(Type * Ty) {
+  if (Ty->isVectorTy() && Ty->isFPOrFPVectorTy())
+    return true;
+
+  return false;
+}
+
 MipsCCState::SpecialCallingConvType
 MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee,
                                             const MipsSubtarget &Subtarget) {
@@ -81,8 +97,8 @@ void MipsCCState::PreAnalyzeCallResultForF128(
   }
 }
 
-/// Identify lowered values that originated from f128 arguments and record
-/// this for use by RetCC_MipsN.
+/// Identify lowered values that originated from f128 or float arguments and
+/// record this for use by RetCC_MipsN.
 void MipsCCState::PreAnalyzeReturnForF128(
     const SmallVectorImpl<ISD::OutputArg> &Outs) {
   const MachineFunction &MF = getMachineFunction();
@@ -94,26 +110,50 @@ void MipsCCState::PreAnalyzeReturnForF128(
   }
 }
 
-/// Identify lowered values that originated from f128 arguments and record
+/// Identify lowered values that originated from vXfXX and record
+/// this.
+void MipsCCState::PreAnalyzeCallResultForVectorFloat(
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    const TargetLowering::CallLoweringInfo &CLI) {
+  for (unsigned i = 0; i < Ins.size(); ++i) {
+    OriginalRetWasFloatVector.push_back(
+        originalTypeIsVectorFloat(CLI.RetTy));
+  }
+}
+
+/// Identify lowered values that originated from vXfXX arguments and record
 /// this.
+void MipsCCState::PreAnalyzeReturnForVectorFloat(
+    const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  for (unsigned i = 0; i < Outs.size(); ++i) {
+    ISD::OutputArg Out = Outs[i];
+    OriginalRetWasFloatVector.push_back(
+        originalEVTTypeIsVectorFloat(Out.ArgVT));
+  }
+}
+/// Identify lowered values that originated from f128, float and sret to vXfXX
+/// arguments and record this.
 void MipsCCState::PreAnalyzeCallOperands(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     std::vector<TargetLowering::ArgListEntry> &FuncArgs,
     const SDNode *CallNode) {
   for (unsigned i = 0; i < Outs.size(); ++i) {
-    OriginalArgWasF128.push_back(
-        originalTypeIsF128(FuncArgs[Outs[i].OrigArgIndex].Ty, CallNode));
-    OriginalArgWasFloat.push_back(
-        FuncArgs[Outs[i].OrigArgIndex].Ty->isFloatingPointTy());
+    TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex];
+
+    OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, CallNode));
+    OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy());
+
+    OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy());
     CallOperandIsFixed.push_back(Outs[i].IsFixed);
   }
 }
 
-/// Identify lowered values that originated from f128 arguments and record
-/// this.
+/// Identify lowered values that originated from f128, float and vXfXX arguments
+/// and record this.
 void MipsCCState::PreAnalyzeFormalArgumentsForF128(
     const SmallVectorImpl<ISD::InputArg> &Ins) {
   const MachineFunction &MF = getMachineFunction();
+
   for (unsigned i = 0; i < Ins.size(); ++i) {
     Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
 
@@ -123,6 +163,7 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
     if (Ins[i].Flags.isSRet()) {
       OriginalArgWasF128.push_back(false);
       OriginalArgWasFloat.push_back(false);
+      OriginalArgWasFloatVector.push_back(false);
       continue;
     }
 
@@ -132,5 +173,10 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128(
     OriginalArgWasF128.push_back(
         originalTypeIsF128(FuncArg->getType(), nullptr));
     OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy());
+
+    // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the
+    // first argument is actually an SRet pointer to a vector, then the next
+    // argument slot is $a2.
+    OriginalArgWasFloatVector.push_back(FuncArg->getType()->isVectorTy());
   }
 }
diff --git a/lib/Target/Mips/MipsCCState.h b/lib/Target/Mips/MipsCCState.h
index 081c393a09b..d86bb85126b 100644
--- a/lib/Target/Mips/MipsCCState.h
+++ b/lib/Target/Mips/MipsCCState.h
@@ -45,16 +45,33 @@ private:
                          const SDNode *CallNode);
 
   /// Identify lowered values that originated from f128 arguments and record
-  /// this.
+  /// this for use by RetCC_MipsN.
   void
   PreAnalyzeFormalArgumentsForF128(const SmallVectorImpl<ISD::InputArg> &Ins);
 
+  void PreAnalyzeCallResultForVectorFloat(
+      const SmallVectorImpl<ISD::InputArg> &Ins,
+      const TargetLowering::CallLoweringInfo &CLI);
+
+  void PreAnalyzeFormalArgumentsForVectorFloat(
+      const SmallVectorImpl<ISD::InputArg> &Ins);
+
+  void
+  PreAnalyzeReturnForVectorFloat(const SmallVectorImpl<ISD::OutputArg> &Outs);
+
   /// Records whether the value has been lowered from an f128.
   SmallVector<bool, 4> OriginalArgWasF128;
 
   /// Records whether the value has been lowered from float.
   SmallVector<bool, 4> OriginalArgWasFloat;
 
+  /// Records whether the value has been lowered from a floating point vector.
+  SmallVector<bool, 4> OriginalArgWasFloatVector;
+
+  /// Records whether the return value has been lowered from a floating point
+  /// vector.
+  SmallVector<bool, 4> OriginalRetWasFloatVector;
+
   /// Records whether the value was a fixed argument.
   /// See ISD::OutputArg::IsFixed,
   SmallVector<bool, 4> CallOperandIsFixed;
@@ -78,6 +95,7 @@ public:
     CCState::AnalyzeCallOperands(Outs, Fn);
     OriginalArgWasF128.clear();
     OriginalArgWasFloat.clear();
+    OriginalArgWasFloatVector.clear();
     CallOperandIsFixed.clear();
   }
 
@@ -96,31 +114,38 @@ public:
     CCState::AnalyzeFormalArguments(Ins, Fn);
     OriginalArgWasFloat.clear();
     OriginalArgWasF128.clear();
+    OriginalArgWasFloatVector.clear();
   }
 
   void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn Fn,
                          const TargetLowering::CallLoweringInfo &CLI) {
     PreAnalyzeCallResultForF128(Ins, CLI);
+    PreAnalyzeCallResultForVectorFloat(Ins, CLI);
     CCState::AnalyzeCallResult(Ins, Fn);
     OriginalArgWasFloat.clear();
     OriginalArgWasF128.clear();
+    OriginalRetWasFloatVector.clear(); // Filled by the pre-analysis above.
   }
 
   void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
                      CCAssignFn Fn) {
     PreAnalyzeReturnForF128(Outs);
+    PreAnalyzeReturnForVectorFloat(Outs);
     CCState::AnalyzeReturn(Outs, Fn);
     OriginalArgWasFloat.clear();
     OriginalArgWasF128.clear();
+    OriginalRetWasFloatVector.clear(); // Filled by the pre-analysis above.
   }
 
   bool CheckReturn(const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
                    CCAssignFn Fn) {
     PreAnalyzeReturnForF128(ArgsFlags);
+    PreAnalyzeReturnForVectorFloat(ArgsFlags);
     bool Return = CCState::CheckReturn(ArgsFlags, Fn);
     OriginalArgWasFloat.clear();
     OriginalArgWasF128.clear();
+    OriginalRetWasFloatVector.clear(); // Filled by the pre-analysis above.
     return Return;
   }
 
@@ -128,6 +153,13 @@ public:
   bool WasOriginalArgFloat(unsigned ValNo) {
       return OriginalArgWasFloat[ValNo];
   }
+  bool WasOriginalArgVectorFloat(unsigned ValNo) const {
+    return OriginalArgWasFloatVector[ValNo];
+  }
+  bool WasOriginalRetVectorFloat(unsigned ValNo) const {
+    return OriginalRetWasFloatVector[ValNo];
+  }
+
   bool IsCallOperandFixed(unsigned ValNo) { return CallOperandIsFixed[ValNo]; }
   SpecialCallingConvType getSpecialCallingConv() { return SpecialCallingConv; }
 };
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index a57cb7badc1..b5df78f89a6 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -37,6 +37,10 @@ class CCIfOrigArgWasF128<CCAction A>
 class CCIfArgIsVarArg<CCAction A>
     : CCIf<"!static_cast<MipsCCState *>(&State)->IsCallOperandFixed(ValNo)", A>;
 
+/// Match if the original return type was not a floating point vector.
+class CCIfOrigArgWasNotVectorFloat<CCAction A>
+    : CCIf<"!static_cast<MipsCCState *>(&State)"
+                "->WasOriginalRetVectorFloat(ValNo)", A>;
 
 /// Match if the special calling conv is the specified value.
 class CCIfSpecialCallingConv<string CC, CCAction A>
@@ -93,8 +97,10 @@ def RetCC_MipsO32 : CallingConv<[
   // Promote i1/i8/i16 return values to i32.
   CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
 
-  // i32 are returned in registers V0, V1, A0, A1
-  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>,
+  // i32 are returned in registers V0, V1, A0, A1, unless the original return
+  // type was a vector of floats.
+  CCIfOrigArgWasNotVectorFloat<CCIfType<[i32],
+                                        CCAssignToReg<[V0, V1, A0, A1]>>>,
 
   // f32 are returned in registers F0, F2
   CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 93c5f496ce9..a726e25f0b0 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -71,6 +71,48 @@ static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   return true;
 }
 
+// The MIPS MSA ABI passes vector arguments in the integer register set.
+// The number of integer registers used is dependent on the ABI used.
+MVT MipsTargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
+  if (VT.isVector() && Subtarget.hasMSA())
+    return Subtarget.isABI_O32() ? MVT::i32 : MVT::i64;
+  return MipsTargetLowering::getRegisterType(VT);
+}
+
+MVT MipsTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                      EVT VT) const {
+  if (VT.isVector()) {
+      if (Subtarget.isABI_O32()) {
+        return MVT::i32;
+      } else {
+        return (VT.getSizeInBits() == 32) ? MVT::i32 : MVT::i64;
+      }
+  }
+  return MipsTargetLowering::getRegisterType(Context, VT);
+}
+
+unsigned MipsTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                           EVT VT) const {
+  if (VT.isVector())
+    return std::max((VT.getSizeInBits() / (Subtarget.isABI_O32() ? 32 : 64)),
+                    1U);
+  return MipsTargetLowering::getNumRegisters(Context, VT);
+}
+
+unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const {
+
+  // Break down vector types to either 2 i64s or 4 i32s.
+  RegisterVT = getRegisterTypeForCallingConv(Context, VT) ;
+  IntermediateVT = RegisterVT;
+  NumIntermediates = VT.getSizeInBits() < RegisterVT.getSizeInBits()
+                         ? VT.getVectorNumElements()
+                         : VT.getSizeInBits() / RegisterVT.getSizeInBits();
+
+  return NumIntermediates;
+}
+
 SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
   MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
   return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
@@ -2515,6 +2557,11 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
 //       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is
 //       not used, it must be shadowed. If only A3 is available, shadow it and
 //       go to stack.
+// vXiX - Received as scalarized i32s, passed in A0 - A3 and the stack.
+// vXf32 - Passed in either a pair of registers {A0, A1}, {A2, A3} or {A0 - A3}
+//         with the remainder spilled to the stack.
+// vXf64 - Passed in either {A0, A1, A2, A3} or {A2, A3} and in both cases
+//         spilling the remainder to the stack.
 //
 //  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
 //===----------------------------------------------------------------------===//
@@ -2526,8 +2573,13 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
       State.getMachineFunction().getSubtarget());
 
   static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+
+  const MipsCCState * MipsState = static_cast<MipsCCState *>(&State);
+
   static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
 
+  static const MCPhysReg FloatVectorIntRegs[] = { Mips::A0, Mips::A2 };
+
   // Do not process byval args here.
   if (ArgFlags.isByVal())
     return true;
@@ -2565,8 +2617,26 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 State.getFirstUnallocated(F32Regs) != ValNo;
   unsigned OrigAlign = ArgFlags.getOrigAlign();
   bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
-
-  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
+  bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo);
+
+  // The MIPS vector ABI for floats passes them in a pair of registers
+  if (ValVT == MVT::i32 && isVectorFloat) {
+    // This is the start of a vector that was scalarized into an unknown number
+    // of components. It doesn't matter how many there are. Allocate one of the
+    // notional 8 byte aligned registers which map onto the argument stack, and
+    // shadow the register lost to alignment requirements.
+    if (ArgFlags.isSplit()) {
+      Reg = State.AllocateReg(FloatVectorIntRegs);
+      if (Reg == Mips::A2)
+        State.AllocateReg(Mips::A1);
+      else if (Reg == 0)
+        State.AllocateReg(Mips::A3);
+    } else {
+      // If we're an intermediate component of the split, we can just attempt to
+      // allocate a register directly.
+      Reg = State.AllocateReg(IntRegs);
+    }
+  } else if (ValVT == MVT::i32 || (ValVT == MVT::f32 && AllocateFloatsInIntReg)) {
     Reg = State.AllocateReg(IntRegs);
     // If this is the first part of an i64 arg,
     // the allocated register must be either A0 or A2.
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 2dcafd51061..0e47ed38f42 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -248,6 +248,33 @@ namespace llvm {
     bool isCheapToSpeculateCttz() const override;
     bool isCheapToSpeculateCtlz() const override;
 
+    /// Return the register type for a given MVT, ensuring vectors are treated
+    /// as a series of gpr sized integers.
+    virtual MVT getRegisterTypeForCallingConv(MVT VT) const override;
+
+    /// Return the register type for a given EVT, ensuring vectors are treated
+    /// as a series of gpr sized integers.
+    virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                              EVT VT) const override;
+
+    /// Return the number of registers for a given EVT, ensuring vectors are
+    /// treated as a series of gpr sized integers.
+    virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                                   EVT VT) const override;
+
+    /// Break down vectors to the correct number of gpr sized integers.
+    virtual unsigned getVectorTypeBreakdownForCallingConv(
+        LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+        unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+    /// Return the correct alignment for the current calling convention.
+    virtual unsigned
+    getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const override {
+      if (ArgTy->isVectorTy())
+        return std::min(DL.getABITypeAlignment(ArgTy), 8U);
+      return DL.getABITypeAlignment(ArgTy);
+    }
+
     ISD::NodeType getExtendForAtomicOps() const override {
       return ISD::SIGN_EXTEND;
     }
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 65be350f259..625c80b9d68 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -283,10 +283,12 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
   uint64_t stackSize = MF.getFrameInfo().getStackSize();
   int64_t spOffset = MF.getFrameInfo().getObjectOffset(FrameIndex);
+  unsigned alignment = MF.getFrameInfo().getObjectAlignment(FrameIndex);
 
   DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
                << "spOffset   : " << spOffset << "\n"
-               << "stackSize  : " << stackSize << "\n");
+               << "stackSize  : " << stackSize << "\n"
+               << "alignment  : " << alignment << "\n");
 
   eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
 }
diff --git a/test/CodeGen/Mips/cconv/vector.ll b/test/CodeGen/Mips/cconv/vector.ll
new file mode 100644
index 00000000000..5a88d064fe7
--- /dev/null
+++ b/test/CodeGen/Mips/cconv/vector.ll
@@ -0,0 +1,1657 @@
+; RUN: llc < %s -march=mips -mcpu=mips32 -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS32,MIPS32EB
+; RUN: llc < %s -march=mips64 -relocation-model=pic -mcpu=mips64 -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS64,MIPS64EB
+; RUN: llc < %s -march=mips -mcpu=mips32r5 -mattr=+fp64,+msa -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS32R5,MIPS32R5EB
+; RUN: llc < %s -march=mips64 -relocation-model=pic -mcpu=mips64r5 -mattr=+fp64,+msa -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS64R5
+; RUN: llc < %s -march=mipsel -mcpu=mips32 -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS32,MIPS32EL
+; RUN: llc < %s -march=mips64el -relocation-model=pic -mcpu=mips64 -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS64,MIPS64EL
+; RUN: llc < %s -march=mipsel -mcpu=mips32r5 -mattr=+fp64,+msa -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS32R5,MIPS32R5EL
+; RUN: llc < %s -march=mips64el -relocation-model=pic -mcpu=mips64r5 -mattr=+fp64,+msa -disable-mips-delay-filler | FileCheck %s --check-prefixes=ALL,MIPS64R5
+
+
+
+; Test that vector types are passed through the integer register set whether or
+; not MSA is enabled. This is an ABI requirement for MIPS. For GCC compatibility
+; we need to handle any power of 2 number of elements. We will test this
+; exhaustively for combinations up to MSA register (128 bits) size.
+
+; First set of tests are for argument passing.
+
+define <2 x i8> @i8_2(<2 x i8> %a, <2 x i8> %b) {
+; ALL-LABEL: i8_2:
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $5, 24
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $4, 24
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $4, 16
+
+; MIPS32EL: addu $1, $4, $5
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $5, 56
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $4, 56
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $4, 48
+
+; MIPS64EL-DAG: sll ${{[0-9]+}}, $4, 0
+; MIPS64EL-DAG: sll ${{[0-9]+}}, $5, 0
+
+; MIPS64R5-DAG: sd $4
+; MIPS64R5-DAG: sd $5
+
+  %1 = add <2 x i8> %a, %b
+  ret <2 x i8> %1
+}
+
+; Test that vectors spilled to the outgoing argument area have the expected
+; offset from $sp.
+
+define <2 x i8> @i8x2_7(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d,
+                        <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
+entry:
+
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $4, 24
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $5, 24
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $6, 24
+; MIPS32EB-DAG: srl ${{[0-9]+}}, $7, 24
+
+; MIPS32EL-DAG: andi ${{[0-9]+}}, $4, 65280
+; MIPS32EL-DAG: andi ${{[0-9]+}}, $5, 65280
+; MIPS32EL-DAG: andi ${{[0-9]+}}, $6, 65280
+; MIPS32EL-DAG: andi ${{[0-9]+}}, $7, 65280
+
+; MIPS32-DAG: lbu ${{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lbu ${{[0-9]+}}, 17($sp)
+; MIPS32-DAG: lbu ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: lbu ${{[0-9]+}}, 21($sp)
+; MIPS32-DAG: lbu ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: lbu ${{[0-9]+}}, 25($sp)
+
+; MIPS32R5-DAG: sw $4, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: sw $5, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: sw $6, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: sw $7, {{[0-9]+}}($sp)
+
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 40($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 41($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 42($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 43($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 44($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 45($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 46($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 47($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 48($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 49($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 50($sp)
+; MIPS32R5-DAG: lbu ${{[0-9]+}}, 51($sp)
+
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $4, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $6, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $7, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $8, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $9, 48
+; MIPS64EB-DAG: dsrl ${{[0-9]+}}, $10, 48
+
+; MIPS64R5-DAG: sd $4, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $5, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $6, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $7, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $8, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $9, {{[0-9]+}}($sp)
+; MIPS64R5-DAG: sd $10, {{[0-9]+}}($sp)
+
+  %0 = add <2 x i8> %a, %b
+  %1 = add <2 x i8> %0, %c
+  %2 = add <2 x i8> %1, %d
+  %3 = add <2 x i8> %2, %e
+  %4 = add <2 x i8> %3, %f
+  %5 = add <2 x i8> %4, %g
+  ret <2 x i8> %5
+}
+
+define <4 x i8> @i8_4(<4 x i8> %a, <4 x i8> %b) {
+; ALL-LABEL: i8_4:
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 8
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+
+; MIPS64-DAG: sll ${{[0-9]+}}, $4, 0
+; MIPS64-DAG: sll ${{[0-9]+}}, $5, 0
+
+; MIPS64R5-DAG: sll ${{[0-9]+}}, $4, 0
+; MIPS64R5-DAG: sll ${{[0-9]+}}, $5, 0
+
+  %1 = add <4 x i8> %a, %b
+  ret <4 x i8> %1
+}
+
+define <8 x i8> @i8_8(<8 x i8> %a, <8 x i8> %b) {
+; ALL-LABEL: i8_8:
+; MIPS32-NOT: lw
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 8
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+; MIPS32R5-DAG: sw $6
+; MIPS32R5-DAG: sw $7
+
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 40
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 40
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 32
+; MIPS64-DAG: sll $[[R0:[0-9]+]], $4, 0
+; MIPS64-DAG: sll $[[R1:[0-9]+]], $5, 0
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R1]], 24
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R0]], 24
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R1]], 16
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R0]], 16
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R1]], 8
+; MIPS64-DAG: srl ${{[0-9]+}}, $[[R0]], 8
+
+; MIPS64R5-DAG: sd $4
+; MIPS64R5-DAG: sd $5
+
+  %1 = add <8 x i8> %a, %b
+  ret <8 x i8> %1
+}
+
+define <16 x i8> @i8_16(<16 x i8> %a, <16 x i8> %b) {
+; ALL-LABEL: i8_16:
+; MIPS32-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 24
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 8
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 8
+
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $7
+
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 40
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 40
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 56
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 32
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][{{[0-9]}}], $4
+; MIPS64R5-DAG: insert.d $w[[W0]][{{[0-9]}}], $5
+; MIPS64R5-DAG: insert.d $w[[W1:[0-9]+]][{{[0-9]}}], $6
+; MIPS64R5-DAG: insert.d $w[[W1]][{{[0-9]}}], $7
+
+  %1 = add <16 x i8> %a, %b
+
+  ret <16 x i8> %1
+}
+
+define <2 x i16> @i16_2(<2 x i16> %a, <2 x i16> %b) {
+; ALL-LABEL: i16_2:
+; MIPS32: addu    $[[R0:[0-9]+]], $4, $5
+; MIPS32: andi    $[[R1:[0-9]+]], $[[R0]], 65535
+; MIPS32: srl     $[[R2:[0-9]+]], $5, 16
+; MIPS32: srl     $[[R3:[0-9]+]], $4, 16
+; MIPS32: addu    $[[R4:[0-9]+]], $[[R3]], $[[R2]]
+; MIPS32: sll     $2, $[[R4]], 16
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+
+; MIPS64-DAG: sll ${{[0-9]+}}, $5, 0
+; MIPS64-DAG: sll ${{[0-9]+}}, $4, 0
+
+; MIPS64R5-DAG: sll ${{[0-9]+}}, $4, 0
+; MIPS64R5-DAG: sll ${{[0-9]+}}, $5, 0
+
+  %1 = add <2 x i16> %a, %b
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @i16_4(<4 x i16> %a, <4 x i16> %b) {
+; ALL-LABEL: i16_4:
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 16
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+; MIPS32R5-DAG: sw $6
+; MIPS32R5-DAG: sw $7
+
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 32
+
+; MIPS64R5-DAG: sd $4
+; MIPS64R5-DAG: sd $5
+
+  %1 = add <4 x i16> %a, %b
+  ret <4 x i16> %1
+}
+
+define <8 x i16> @i16_8(<8 x i16> %a, <8 x i16> %b) {
+; ALL-LABEL: i16_8:
+; MIPS32-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: srl ${{[0-9]+}}, $7, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $6, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $5, 16
+; MIPS32-DAG: srl ${{[0-9]+}}, $4, 16
+
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $7
+
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $6, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $7, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 48
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 32
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 32
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][{{[0-9]}}], $4
+; MIPS64R5-DAG: insert.d $w[[W0]][{{[0-9]}}], $5
+; MIPS64R5-DAG: insert.d $w[[W1:[0-9]+]][{{[0-9]}}], $6
+; MIPS64R5-DAG: insert.d $w[[W1]][{{[0-9]}}], $7
+
+  %1 = add <8 x i16> %a, %b
+  ret <8 x i16> %1
+}
+
+define <2 x i32> @i32_2(<2 x i32> %a, <2 x i32> %b) {
+; ALL-LABEL: i32_2:
+; MIPS32-DAG: addu    $2, $4, $6
+; MIPS32-DAG: addu    $3, $5, $7
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+; MIPS32R5-DAG: sw $6
+; MIPS32R5-DAG: sw $7
+
+; MIPS64-DAG: sll     ${{[0-9]+}}, $4, 0
+; MIPS64-DAG: sll     ${{[0-9]+}}, $5, 0
+
+; MIPS64R5-DAG: sd $4
+; MIPS64R5-DAG: sd $5
+
+  %1 = add <2 x i32> %a, %b
+
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @i32_4(<4 x i32> %a, <4 x i32> %b) {
+; ALL-LABEL: i32_4:
+; MIPS32-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: addu $2
+; MIPS32-DAG: addu $3
+; MIPS32-DAG: addu $4
+; MIPS32-DAG: addu $5
+
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $7
+
+; MIPS64-DAG: sll     ${{[0-9]+}}, $4, 0
+; MIPS64-DAG: sll     ${{[0-9]+}}, $5, 0
+; MIPS64-DAG: sll     ${{[0-9]+}}, $6, 0
+; MIPS64-DAG: sll     ${{[0-9]+}}, $7, 0
+; MIPS64-DAG: dsrl    ${{[0-9]+}}, $4, 32
+; MIPS64-DAG: dsrl    ${{[0-9]+}}, $5, 32
+; MIPS64-DAG: dsrl    ${{[0-9]+}}, $6, 32
+; MIPS64-DAG: dsrl    ${{[0-9]+}}, $7, 32
+  %1 = add <4 x i32> %a, %b
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @i64_2(<2 x i64> %a, <2 x i64> %b) {
+; ALL-LABEL: i64_2:
+; MIPS32-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: addu $2
+; MIPS32-DAG: addu $3
+; MIPS32-DAG: addu $4
+; MIPS32-DAG: addu $5
+
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 16($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: lw ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $7
+
+; MIPS64-DAG: daddu $2, $4, $6
+; MIPS64-DAG: daddu $3, $5, $7
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][{{[0-9]}}], $4
+; MIPS64R5-DAG: insert.d $w[[W0]][{{[0-9]}}], $5
+; MIPS64R5-DAG: insert.d $w[[W1:[0-9]+]][{{[0-9]}}], $6
+; MIPS64R5-DAG: insert.d $w[[W1]][{{[0-9]}}], $7
+
+  %1 = add <2 x i64> %a, %b
+  ret <2 x i64> %1
+}
+
+; The MIPS vector ABI treats vectors of floats differently to vectors of
+; integers.
+
+; For arguments, floating point vectors are bitcast to integer vectors whose
+; elements are of GPR width and where the element count is deduced from
+; the length of the floating point vector divided by the size of the GPRs.
+
+; For returns, integer vectors are passed via the GPR register set, but
+; floating point vectors are returned via a hidden sret pointer.
+
+; For testing purposes we skip returning values here and test them below
+; instead.
+@float_res_v2f32 = external global <2 x float>
+
+define void @float_2(<2 x float> %a, <2 x float> %b) {
+; ALL-LABEL: float_2:
+; MIPS32: mtc1 $7, $f[[F0:[0-9]+]]
+; MIPS32: mtc1 $5, $f[[F1:[0-9]+]]
+; MIPS32: add.s $f[[F2:[0-9]+]], $f[[F1]], $f[[F0]]
+; MIPS32: swc1 $f[[F2]]
+; MIPS32: mtc1 $6, $f[[F3:[0-9]+]]
+; MIPS32: mtc1 $4, $f[[F4:[0-9]+]]
+; MIPS32: add.s $f[[F5:[0-9]+]], $f[[F4]], $f[[F3]]
+; MIPS32: swc1 $f[[F5]]
+
+; MIPS32R5-DAG: sw $4
+; MIPS32R5-DAG: sw $5
+; MIPS32R5-DAG: sw $6
+; MIPS32R5-DAG: sw $7
+
+; MIPS64-DAG: sll $[[R0:[0-9]+]], $4, 0
+; MIPS64-DAG: sll $[[R1:[0-9]+]], $5, 0
+; MIPS64-DAG: mtc1 $[[R0]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R1]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R2:[0-9]+]], $4, 32
+; MIPS64-DAG: dsrl $[[R3:[0-9]+]], $5, 32
+; MIPS64-DAG: sll $[[R4:[0-9]+]], $[[R2]], 0
+; MIPS64-DAG: sll $[[R5:[0-9]+]], $[[R3]], 0
+; MIPS64-DAG: mtc1 $[[R4]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R5]], $f{{[0-9]+}}
+
+; MIPS64R5-DAG: sd $4
+; MIPS64R5-DAG: sd $5
+
+  %1 = fadd <2 x float> %a, %b
+  store <2 x float> %1, <2 x float> * @float_res_v2f32
+  ret void
+}
+
+@float_res_v4f32 = external global <4 x float>
+
+; For MSA this case is suboptimal, the 4 loads can be combined into a single
+; ld.w.
+
+define void @float_4(<4 x float> %a, <4 x float> %b) {
+; ALL-LABEL: float_4:
+; MIPS32-DAG: mtc1 $4
+; MIPS32-DAG: mtc1 $5
+; MIPS32-DAG: mtc1 $6
+; MIPS32-DAG: mtc1 $7
+; MIPS32-DAG: lwc1
+; MIPS32-DAG: lwc1
+; MIPS32-DAG: lwc1
+; MIPS32-DAG: lwc1
+
+; MIPS32R5-DAG: lw $[[R1:[0-9]+]], 16($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $[[R1]]
+; MIPS32R5-DAG: lw $[[R2:[0-9]+]], 20($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $[[R2]]
+; MIPS32R5-DAG: lw $[[R3:[0-9]+]], 24($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $[[R3]]
+; MIPS32R5-DAG: lw $[[R4:[0-9]+]], 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $[[R4]]
+
+; MIPS32R5-DAG: insert.w $w[[W1:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W1]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W1]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W1]][3], $7
+
+; MIPS64-DAG: sll $[[R0:[0-9]+]], $4, 0
+; MIPS64-DAG: sll $[[R1:[0-9]+]], $5, 0
+; MIPS64-DAG: mtc1 $[[R0]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R1]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R2:[0-9]+]], $4, 32
+; MIPS64-DAG: dsrl $[[R3:[0-9]+]], $5, 32
+; MIPS64-DAG: sll $[[R4:[0-9]+]], $[[R2]], 0
+; MIPS64-DAG: sll $[[R5:[0-9]+]], $[[R3]], 0
+; MIPS64-DAG: mtc1 $[[R4]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R5]], $f{{[0-9]+}}
+; MIPS64-DAG: sll $[[R6:[0-9]+]], $6, 0
+; MIPS64-DAG: sll $[[R7:[0-9]+]], $7, 0
+; MIPS64-DAG: mtc1 $[[R6]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R7]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R8:[0-9]+]], $6, 32
+; MIPS64-DAG: dsrl $[[R9:[0-9]+]], $7, 32
+; MIPS64-DAG: sll $[[R10:[0-9]+]], $[[R8]], 0
+; MIPS64-DAG: sll $[[R11:[0-9]+]], $[[R9]], 0
+; MIPS64-DAG: mtc1 $[[R10]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R11]], $f{{[0-9]+}}
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][{{[0-9]}}], $4
+; MIPS64R5-DAG: insert.d $w[[W0]][{{[0-9]}}], $5
+; MIPS64R5-DAG: insert.d $w[[W1:[0-9]+]][{{[0-9]}}], $6
+; MIPS64R5-DAG: insert.d $w[[W1]][{{[0-9]}}], $7
+
+  %1 = fadd <4 x float> %a, %b
+  store <4 x float> %1, <4 x float> * @float_res_v4f32
+  ret void
+}
+
+@double_v2f64 = external global <2 x double>
+
+define void @double_2(<2 x double> %a, <2 x double> %b) {
+; ALL-LABEL: double_2:
+; MIPS32-DAG: sw $7
+; MIPS32-DAG: sw $6
+; MIPS32-DAG: ldc1
+; MIPS32-DAG: ldc1
+; MIPS32:     add.d
+; MIPS32-DAG: sw $5
+; MIPS32-DAG: sw $4
+; MIPS32-DAG: ldc1
+; MIPS32-DAG: ldc1
+; MIPS32:     add.d
+
+; MIPS32R5-DAG: lw $[[R1:[0-9]+]], 16($sp)
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $[[R1]]
+; MIPS32R5-DAG: lw $[[R2:[0-9]+]], 20($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $[[R2]]
+; MIPS32R5-DAG: lw $[[R3:[0-9]+]], 24($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $[[R3]]
+; MIPS32R5-DAG: lw $[[R4:[0-9]+]], 28($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $[[R4]]
+
+; MIPS32R5-DAG: insert.w $w[[W1:[0-9]+]][0], $4
+; MIPS32R5-DAG: insert.w $w[[W1]][1], $5
+; MIPS32R5-DAG: insert.w $w[[W1]][2], $6
+; MIPS32R5-DAG: insert.w $w[[W1]][3], $7
+
+; MIPS64-DAG: dmtc1 $6, $f[[R0:[0-9]+]]
+; MIPS64-DAG: dmtc1 $4, $f[[R1:[0-9]+]]
+; MIPS64-DAG: add.d $f[[R2:[0-9]+]], $f[[R1]], $f[[R0]]
+; MIPS64-DAG: dmtc1 $7, $f[[R3:[0-9]+]]
+; MIPS64-DAG: dmtc1 $5, $f[[R4:[0-9]+]]
+; MIPS64-DAG: add.d $f[[R5:[0-9]+]], $f[[R4]], $f[[R3]]
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][{{[0-9]}}], $4
+; MIPS64R5-DAG: insert.d $w[[W0]][{{[0-9]}}], $5
+; MIPS64R5-DAG: insert.d $w[[W1:[0-9]+]][{{[0-9]}}], $6
+; MIPS64R5-DAG: insert.d $w[[W1]][{{[0-9]}}], $7
+
+  %1 = fadd <2 x double> %a, %b
+  store <2 x double> %1, <2 x double> * @double_v2f64
+  ret void
+}
+
+; Return value testing.
+; Integer vectors are returned in $2, $3, $4, $5 for O32, $2, $3 for N32/N64
+; Floating point vectors use a hidden sret pointer on O32, but $2, $3 on MIPS64.
+
+@gv2i8 = global <2 x i8> <i8 1, i8 2>
+@gv4i8 = global <4 x i8> <i8 0, i8 1, i8 2, i8 3>
+@gv8i8 = global <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
+@gv16i8 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>
+
+@gv2i16 = global <2 x i16> <i16 1, i16 2>
+@gv4i16 = global <4 x i16> <i16 0, i16 1, i16 2, i16 3>
+@gv8i16 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
+
+@gv2i32 = global <2 x i32> <i32 0, i32 1>
+@gv4i32 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+@gv2i64 = global <2 x i64> <i64 0, i64 1>
+
+define <2 x i8> @ret_2_i8() {
+; ALL-LABEL: ret_2_i8:
+; MIPS32-DAG:   lhu $2
+; MIPS32R5-DAG: lhu $2
+
+; FIXME: why is this lh instead of lhu on mips64?
+
+; MIPS64-DAG:  lh $2
+; MIPS64R5-DAG: lh $2
+  %1 = load <2 x i8>, <2 x i8> * @gv2i8
+  ret <2 x i8> %1
+}
+
+define <4 x i8> @ret_4_i8() {
+; ALL-LABEL: ret_4_i8:
+; MIPS32-DAG:   lw $2
+; MIPS32R5-DAG: lw $2
+
+; MIPS64-DAG:   lw $2
+; MIPS64R5-DAG: lw $2
+
+  %1 = load <4 x i8>, <4 x i8> * @gv4i8
+  ret <4 x i8> %1
+}
+
+define <8 x i8> @ret_8_i8() {
+; ALL-LABEL: ret_8_i8:
+; MIPS32-DAG:   lw $2
+; MIPS32-DAG:   lw $3
+
+; MIPS32R5: copy_s.w $2, $w[[W0:[0-9]+]]
+; MIPS32R5: copy_s.w $3, $w[[W0]]
+
+; MIPS64-DAG:   ld $2
+; MIPS64R5-DAG: ld $2
+  %1 = load <8 x i8>, <8 x i8> * @gv8i8
+  ret <8 x i8> %1
+}
+
+define <16 x i8> @ret_16_i8() {
+; ALL-LABEL: ret_16_i8:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+; MIPS32-DAG: lw $4
+; MIPS32-DAG: lw $5
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]][0]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]][1]
+; MIPS32R5-DAG: copy_s.w $4, $w[[W0]][2]
+; MIPS32R5-DAG: copy_s.w $5, $w[[W0]][3]
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2
+; MIPS64R5-DAG: copy_s.d $3
+
+  %1 = load <16 x i8>, <16 x i8> * @gv16i8
+  ret <16 x i8> %1
+}
+
+define <2 x i16> @ret_2_i16() {
+; ALL-LABEL: ret_2_i16:
+; MIPS32-DAG:   lw $2
+
+; MIPS32R5-DAG: lw $2
+
+; MIPS64-DAG:   lw $2
+
+; MIPS64R5-DAG: lw $2
+  %1 = load <2 x i16>, <2 x i16> * @gv2i16
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @ret_4_i16() {
+; ALL-LABEL: ret_4_i16:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]]
+
+; MIPS64-DAG:   ld $2
+; MIPS64R5-DAG: ld $2
+  %1 = load <4 x i16>, <4 x i16> * @gv4i16
+  ret <4 x i16> %1
+}
+
+define <8 x i16> @ret_8_i16() {
+; ALL-LABEL: ret_8_i16:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+; MIPS32-DAG: lw $4
+; MIPS32-DAG: lw $5
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]][0]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]][1]
+; MIPS32R5-DAG: copy_s.w $4, $w[[W0]][2]
+; MIPS32R5-DAG: copy_s.w $5, $w[[W0]][3]
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2
+; MIPS64R5-DAG: copy_s.d $3
+
+  %1 = load <8 x i16>, <8 x i16> * @gv8i16
+  ret <8 x i16> %1
+}
+
+define <2 x i32> @ret_2_i32() {
+; ALL-LABEL: ret_2_i32:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]]
+
+; MIPS64-DAG:   ld $2
+; MIPS64R5-DAG: ld $2
+
+  %1 = load <2 x i32>, <2 x i32> * @gv2i32
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @ret_4_i32() {
+; ALL-LABEL: ret_4_i32:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+; MIPS32-DAG: lw $4
+; MIPS32-DAG: lw $5
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]][0]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]][1]
+; MIPS32R5-DAG: copy_s.w $4, $w[[W0]][2]
+; MIPS32R5-DAG: copy_s.w $5, $w[[W0]][3]
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2, $w[[W0:[0-9]+]]
+; MIPS64R5-DAG: copy_s.d $3, $w[[W0]]
+
+  %1 = load <4 x i32>, <4 x i32> * @gv4i32
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @ret_2_i64() {
+; ALL-LABEL: ret_2_i64:
+; MIPS32-DAG: lw $2
+; MIPS32-DAG: lw $3
+; MIPS32-DAG: lw $4
+; MIPS32-DAG: lw $5
+
+; MIPS32R5-DAG: copy_s.w $2, $w[[W0:[0-9]+]][0]
+; MIPS32R5-DAG: copy_s.w $3, $w[[W0]][1]
+; MIPS32R5-DAG: copy_s.w $4, $w[[W0]][2]
+; MIPS32R5-DAG: copy_s.w $5, $w[[W0]][3]
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2, $w[[W0:[0-9]+]]
+; MIPS64R5-DAG: copy_s.d $3, $w[[W0]]
+
+  %1 = load <2 x i64>, <2 x i64> * @gv2i64
+  ret <2 x i64> %1
+}
+
+@gv2f32 = global <2 x float> <float 0.0, float 0.0>
+@gv4f32 = global <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
+
+define <2 x float> @ret_float_2() {
+entry:
+; ALL-LABEL: ret_float_2:
+
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 0($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 4($4)
+
+; MIPS32R5-DAG: swc1 $f{{[0-9]+}}, 0($4)
+; MIPS32R5-DAG: swc1 $f{{[0-9]+}}, 4($4)
+
+; MIPS64: ld $2
+
+; MIPS64R5: ld $2
+
+  %0 = load <2 x float>, <2 x float> * @gv2f32
+  ret <2 x float> %0
+}
+
+define <4 x float> @ret_float_4() {
+entry:
+; ALL-LABEL: ret_float_4:
+
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 0($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 4($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 8($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 12($4)
+
+; MIPS32R5: st.w $w{{[0-9]+}}, 0($4)
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2, $w{{[0-9]+}}[0]
+; MIPS64R5-DAG: copy_s.d $3, $w{{[0-9]+}}[1]
+
+  %0 = load <4 x float>, <4 x float> * @gv4f32
+  ret <4 x float> %0
+}
+
+@gv2f64 = global <2 x double> <double 0.0, double 0.0>
+
+define <2 x double> @ret_double_2() {
+entry:
+; ALL-LABEL: ret_double_2:
+
+; MIPS32-DAG: sdc1 $f{{[0-9]+}}, 8($4)
+; MIPS32-DAG: sdc1 $f{{[0-9]+}}, 0($4)
+
+; MIPS32R5: st.d $w{{[0-9]+}}, 0($4)
+
+; MIPS64-DAG: ld $2
+; MIPS64-DAG: ld $3
+
+; MIPS64R5-DAG: copy_s.d $2, $w{{[0-9]+}}[0]
+; MIPS64R5-DAG: copy_s.d $3, $w{{[0-9]+}}[1]
+
+  %0 = load <2 x double>, <2 x double> * @gv2f64
+  ret <2 x double> %0
+}
+
+; Test argument lowering and call result lowering.
+
+define void @call_i8_2() {
+entry:
+; ALL-LABEL: call_i8_2:
+; MIPS32EB-DAG: addiu $4
+; MIPS32EB-DAG: addiu $5
+; MIPS32-NOT: addiu $6
+; MIPS32-NOT: addiu $7
+
+; MIPS32R5-DAG: lhu $4, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: lhu $5, {{[0-9]+}}($sp)
+
+; MIPS32R5: jal
+; MIPS32R5: sw $2, {{[0-9]+}}($sp)
+
+; MIPS32R5-DAG: sb ${{[0-9]+}}, 1(${{[0-9]+}})
+; MIPS32R5-DAG: sb ${{[0-9]+}}, %lo(gv2i8)(${{[0-9]+}})
+
+; MIPS64EB: daddiu $4, $zero, 1543
+; MIPS64EB: daddiu $5, $zero, 3080
+
+; MIPS64EL: daddiu $4, $zero, 1798
+; MIPS64EL: daddiu $5, $zero, 2060
+
+; MIPS64R5-DAG: lh $4
+; MIPS64R5-DAG: lh $5
+
+; MIPS32: jal i8_2
+; MIPS64: jalr $25
+
+; MIPS32EB-DAG: srl $[[R0:[0-9]+]], $2, 16
+; MIPS32EB-DAG: sb $[[R0]]
+; MIPS32EB-DAG: srl $[[R1:[0-9]+]], $2, 24
+; MIPS32EB-DAG: sb $[[R1]]
+
+; MIPS32EL: sb $2
+; MIPS32EL: srl $[[R0:[0-9]+]], $2, 8
+; MIPS32EL: sb $[[R0]]
+
+; MIPS64EB: dsrl $[[R4:[0-9]+]], $2, 48
+; MIPS64EB: sb $[[R4]]
+; MIPS64EB: dsrl $[[R5:[0-9]+]], $2, 56
+; MIPS64EB: sb $[[R5]]
+
+; MIPS64EL: sll $[[R6:[0-9]+]], $2, 0
+; MIPS64EL: sb $[[R6]]
+; MIPS64EL: srl $[[R7:[0-9]+]], $[[R6]], 8
+; MIPS64EL: sb $[[R7]]
+
+; MIPS64R5: sd $2
+
+  %0 = call <2 x i8> @i8_2(<2 x i8> <i8 6, i8 7>, <2 x i8> <i8 12, i8 8>)
+  store <2 x i8> %0, <2 x i8> * @gv2i8
+  ret void
+}
+
+define void @call_i8_4() {
+entry:
+; ALL-LABEL: call_i8_4:
+; MIPS32: ori $4
+; MIPS32: ori $5
+; MIPS32-NOT: ori $6
+; MIPS32-NOT: ori $7
+
+; MIPS32R5-DAG: lw $4, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: lw $5, {{[0-9]+}}($sp)
+
+; MIPS64: ori $4
+; MIPS64: ori $5
+
+; MIPS64R5: lw $4
+; MIPS64R5: lw $5
+
+; MIPS32: jal i8_4
+; MIPS64: jalr $25
+
+; MIPS32: sw $2
+
+; MIPS32R5-DAG: sw $2
+
+; MIPS64: sw $2
+; MIPS64R5: sw $2
+
+  %0 = call <4 x i8> @i8_4(<4 x i8> <i8 6, i8 7, i8 9, i8 10>, <4 x i8> <i8 12, i8 8, i8 9, i8 10>)
+  store <4 x i8> %0, <4 x i8> * @gv4i8
+  ret void
+}
+
+define void @call_i8_8() {
+entry:
+; ALL-LABEL: call_i8_8:
+
+; MIPS32: ori $6
+; MIPS32: ori $4
+; MIPS32: move  $5
+; MIPS32: move  $7
+
+; MIPS32R5-DAG: ori $6
+; MIPS32R5-DAG: ori $4
+; MIPS32R5-DAG: move  $5
+; MIPS32R5-DAG: move  $7
+
+; MIPS64EB: daddiu $4, ${{[0-9]+}}, 2314
+; MIPS64EB: daddiu $5, ${{[0-9]+}}, 2314
+
+; MIPS64EL: daddiu $4, ${{[0-9]+}}, 1798
+; MIPS64EL: daddiu $5, ${{[0-9]+}}, 2060
+
+; MIPS32: jal i8_8
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $2
+; MIPS32-DAG: sw $3
+
+; MIPS32R5-DAG: sw $2
+; MIPS32R5-DAG: sw $3
+
+; MIPS64: sd $2
+; MIPS64R5: sd $2
+
+  %0 = call <8 x i8> @i8_8(<8 x i8> <i8 6, i8 7, i8 9, i8 10, i8 6, i8 7, i8 9, i8 10>, <8 x i8> <i8 12, i8 8, i8 9, i8 10, i8 6, i8 7, i8 9, i8 10>)
+  store <8 x i8> %0, <8 x i8> * @gv8i8
+  ret void
+}
+
+define void @calli8_16() {
+entry:
+; ALL-LABEL: calli8_16:
+; MIPS32-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS32: ori $4, ${{[0-9]+}}, {{[0-9]+}}
+; MIPS32: ori $7, ${{[0-9]+}}, {{[0-9]+}}
+; MIPS32: move  $5, ${{[0-9]+}}
+; MIPS32: move  $6, ${{[0-9]+}}
+
+; MIPS32R5-DAG: copy_s.w $4, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $5, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $6, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $7, $w{{[0-9]+}}
+
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS64-DAG: daddiu $4
+; MIPS64-DAG: daddiu $5
+; MIPS64-DAG: daddiu $6
+; MIPS64-DAG: daddiu $7
+
+; MIPS64R5-DAG: copy_s.d $4
+; MIPS64R5-DAG: copy_s.d $5
+; MIPS64R5-DAG: copy_s.d $6
+; MIPS64R5-DAG: copy_s.d $7
+
+; MIPS32: jal i8_16
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $5, 12(${{[0-9]+}})
+; MIPS32-DAG: sw $4, 8(${{[0-9]+}})
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32-DAG: sw $2, %lo(gv16i8)(${{[0-9]+}})
+
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $2
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $3
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $5
+; MIPS32R5-DAG: st.w $w[[W0]]
+
+; MIPS64-DAG: sd $3
+; MIPS64-DAG: sd $2
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W0]][1], $3
+
+  %0 = call <16 x i8> @i8_16(<16 x i8> <i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7,i8 6, i8 7, i8 6, i8 7, i8 9, i8 10>, <16 x i8> <i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 7, i8 9,i8 12, i8 8, i8 9, i8 10>)
+  store <16 x i8> %0, <16 x i8> * @gv16i8
+  ret void
+}
+
+define void @calli16_2() {
+entry:
+; ALL-LABEL: calli16_2:
+
+; MIPS32-DAG: ori $4
+; MIPS32-DAG: ori $5
+
+; MIPS32R5-DAG: lw $4
+; MIPS32R5-DAG: lw $5
+
+; MIPS64: ori $4
+; MIPS64: ori $5
+
+; MIPS64R5-DAG: lw $4
+; MIPS64R5-DAG: lw $5
+
+; MIPS32: jal i16_2
+; MIPS64: jalr $25
+
+; MIPS32: sw $2, %lo(gv2i16)
+
+; MIPS32R5: sw $2, %lo(gv2i16)
+
+; MIPS64: sw $2
+
+; MIPS64R5: sw $2
+
+  %0 = call <2 x i16> @i16_2(<2 x i16> <i16 6, i16 7>, <2 x i16> <i16 12, i16 8>)
+  store <2 x i16> %0, <2 x i16> * @gv2i16
+  ret void
+}
+
+define void @calli16_4() {
+entry:
+; ALL-LABEL: calli16_4:
+; MIPS32-DAG: ori $4
+; MIPS32-DAG: ori $5
+; MIPS32-DAG: ori $6
+; MIPS32-DAG: move $7
+
+; MIPS32R5-DAG: ori $4
+; MIPS32R5-DAG: ori $5
+; MIPS32R5-DAG: ori $6
+; MIPS32R5-DAG: move $7
+
+; MIPS64-DAG: daddiu $4
+; MIPS64-DAG: daddiu $5
+
+; MIPS64R5-DAG: ld $4
+; MIPS64R5-DAG: ld $5
+
+; MIPS32: jal i16_4
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32-DAG: sw $2, %lo(gv4i16)(${{[0-9]+}})
+
+; MIPS32R5-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32R5-DAG: sw $2, %lo(gv4i16)(${{[0-9]+}})
+
+; MIPS64: sd $2
+; MIPS64R5: sd $2
+
+  %0 = call <4 x i16> @i16_4(<4 x i16> <i16 6, i16 7, i16 9, i16 10>, <4 x i16> <i16 12, i16 8, i16 9, i16 10>)
+  store <4 x i16> %0, <4 x i16> * @gv4i16
+  ret void
+}
+
+define void @calli16_8() {
+entry:
+; ALL-LABEL: calli16_8:
+
+; MIPS32-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS32-DAG: ori $4, ${{[0-9]+}}, {{[0-9]+}}
+; MIPS32-DAG: ori $5, ${{[0-9]+}}, {{[0-9]+}}
+; MIPS32-DAG: move  $6, ${{[0-9]+}}
+; MIPS32-DAG: move  $7, ${{[0-9]+}}
+
+; MIPS32R5-DAG: copy_s.w $4, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $5, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $6, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $7, $w{{[0-9]+}}
+
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS64-DAG: daddiu $4
+; MIPS64-DAG: daddiu $7
+; MIPS64-DAG: move $5
+; MIPS64-DAG: move $6
+
+; MIPS64R5-DAG: copy_s.d $4, $w[[W0:[0-9]+]][0]
+; MIPS64R5-DAG: copy_s.d $5, $w[[W0]][1]
+; MIPS64R5-DAG: copy_s.d $6, $w[[W1:[0-9]+]][0]
+; MIPS64R5-DAG: copy_s.d $7, $w[[W1]][1]
+
+; MIPS32: jal i16_8
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $5, 12(${{[0-9]+}})
+; MIPS32-DAG: sw $4, 8(${{[0-9]+}})
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32-DAG: sw $2, %lo(gv8i16)(${{[0-9]+}})
+
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $2
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $3
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $5
+; MIPS32R5-DAG: st.w $w[[W0]]
+
+; MIPS64: sd $3
+; MIPS64: sd $2
+
+; MIPS64R5-DAG: insert.d $w[[W2:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W2]][1], $3
+
+  %0 = call <8 x i16> @i16_8(<8 x i16> <i16 6, i16 7, i16 9, i16 10, i16 6, i16 7, i16 9, i16 10>, <8 x i16> <i16 6, i16 7, i16 9, i16 10, i16 12, i16 8, i16 9, i16 10>)
+  store <8 x i16> %0, <8 x i16> * @gv8i16
+  ret void
+}
+
+define void @calli32_2() {
+entry:
+; ALL-LABEL: calli32_2:
+
+; MIPS32-DAG: addiu $4
+; MIPS32-DAG: addiu $5
+; MIPS32-DAG: addiu $6
+; MIPS32-DAG: addiu $7
+
+; MIPS32R5-DAG: addiu $4
+; MIPS32R5-DAG: addiu $5
+; MIPS32R5-DAG: addiu $6
+; MIPS32R5-DAG: addiu $7
+
+; MIPS64: daddiu $4
+; MIPS64: daddiu $5
+
+; MIPS64R5-DAG: ld $4
+; MIPS64R5-DAG: ld $5
+
+; MIPS32: jal i32_2
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $2, %lo(gv2i32)(${{[0-9]+}})
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+
+; MIPS32R5-DAG: sw $2, %lo(gv2i32)(${{[0-9]+}})
+; MIPS32R5-DAG: sw $3, 4(${{[0-9]+}})
+
+; MIPS64: sd $2
+
+; MIPS64R5: sd $2
+
+  %0 = call <2 x i32> @i32_2(<2 x i32> <i32 6, i32 7>, <2 x i32> <i32 12, i32 8>)
+  store <2 x i32> %0, <2 x i32> * @gv2i32
+  ret void
+}
+
+define void @calli32_4() {
+entry:
+; ALL-LABEL: calli32_4:
+
+; MIPS32-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS32-DAG: addiu $4
+; MIPS32-DAG: addiu $5
+; MIPS32-DAG: addiu $6
+; MIPS32-DAG: addiu $7
+
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS32R5-DAG: addiu $4
+; MIPS32R5-DAG: addiu $5
+; MIPS32R5-DAG: addiu $6
+; MIPS32R5-DAG: addiu $7
+
+; MIPS64-DAG: daddiu $4
+; MIPS64-DAG: daddiu $6
+; MIPS64-DAG: daddiu $5
+; MIPS64-DAG: move $7
+
+; MIPS64R5-DAG: copy_s.d $4, $w[[W0:[0-9]+]][0]
+; MIPS64R5-DAG: copy_s.d $5, $w[[W0]][1]
+; MIPS64R5-DAG: copy_s.d $6, $w[[W1:[0-9]+]][0]
+; MIPS64R5-DAG: copy_s.d $7, $w[[W1]][1]
+
+; MIPS32: jal i32_4
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $5, 12(${{[0-9]+}})
+; MIPS32-DAG: sw $4, 8(${{[0-9]+}})
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32-DAG: sw $2, %lo(gv4i32)(${{[0-9]+}})
+
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $2
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $3
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $5
+; MIPS32R5-DAG: st.w $w[[W0]]
+
+; MIPS64-DAG: sd $2
+; MIPS64-DAG: sd $3
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W0]][1], $3
+
+  %0 = call <4 x i32> @i32_4(<4 x i32> <i32 6, i32 7, i32 9, i32 10>, <4 x i32> <i32 12, i32 8, i32 9, i32 10>)
+  store <4 x i32> %0, <4 x i32> * @gv4i32
+  ret void
+}
+
+define void @calli64_2() {
+entry:
+; ALL-LABEL: calli64_2:
+
+; MIPS32-DAG: sw  ${{[0-9a-z]+}}, 28($sp)
+; MIPS32-DAG: sw  ${{[0-9a-z]+}}, 24($sp)
+; MIPS32-DAG: sw  ${{[0-9a-z]+}}, 20($sp)
+; MIPS32-DAG: sw  ${{[0-9a-z]+}}, 16($sp)
+
+; MIPS32-DAG: addiu $4
+; MIPS32-DAG: addiu $5
+; MIPS32-DAG: addiu $6
+; MIPS32-DAG: addiu $7
+
+; MIPS32R5-DAG: copy_s.w $4, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $5, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $6, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $7, $w{{[0-9]+}}
+
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS64: daddiu $4
+; MIPS64: daddiu $5
+; MIPS64: daddiu $6
+; MIPS64: daddiu $7
+
+; MIPS64R5: daddiu $4
+; MIPS64R5: daddiu $5
+; MIPS64R5: daddiu $6
+; MIPS64R5: daddiu $7
+
+; MIPS32: jal i64_2
+; MIPS64: jalr $25
+
+; MIPS32-DAG: sw $5, 12(${{[0-9]+}})
+; MIPS32-DAG: sw $4, 8(${{[0-9]+}})
+; MIPS32-DAG: sw $3, 4(${{[0-9]+}})
+; MIPS32-DAG: sw $2, %lo(gv2i64)(${{[0-9]+}})
+
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $2
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $3
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $4
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $5
+; MIPS32R5-DAG: st.w $w[[W0]]
+
+; MIPS64-DAG: sd $3
+; MIPS64-DAG: sd $2
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W0]][1], $3
+
+  %0 = call <2 x i64> @i64_2(<2 x i64> <i64 6, i64 7>, <2 x i64> <i64 12, i64 8>)
+  store <2 x i64> %0, <2 x i64> * @gv2i64
+  ret void
+}
+
+declare <2 x float> @float2_extern(<2 x float>, <2 x float>)
+declare <4 x float> @float4_extern(<4 x float>, <4 x float>)
+declare <2 x double> @double2_extern(<2 x double>, <2 x double>)
+
+define void @callfloat_2() {
+entry:
+; ALL-LABEL: callfloat_2:
+
+; MIPS32-DAG: addiu $4, $sp, 24
+; MIPS32-DAG: addiu $6, $zero, 0
+; MIPS32-DAG: lui $7
+
+; MIPS32R5-DAG: addiu $4, $sp, 24
+; MIPS32R5-DAG: addiu $6, $zero, 0
+; MIPS32R5-DAG: lui $7
+
+; MIPS64: dsll $4
+; MIPS64: dsll $5
+
+; MIPS64R5-DAG: copy_s.d $4, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $5, $w{{[0-9]+}}
+
+; MIPS32: jal float2_extern
+; MIPS64: jalr $25
+
+; MIPS32-DAG: lwc1 $f[[F0:[0-9]+]], 24($sp)
+; MIPS32-DAG: lwc1 $f[[F1:[0-9]+]], 28($sp)
+
+; MIPS32-DAG: swc1 $f[[F1]], 4(${{[0-9]+}})
+; MIPS32-DAG: swc1 $f[[F0]], %lo(gv2f32)(${{[0-9]+}})
+
+; MIPS32R5-DAG: lwc1 $f[[F0:[0-9]+]], 24($sp)
+; MIPS32R5-DAG: lwc1 $f[[F1:[0-9]+]], 28($sp)
+
+; MIPS32R5-DAG: swc1 $f[[F1]], 4(${{[0-9]+}})
+; MIPS32R5-DAG: swc1 $f[[F0]], %lo(gv2f32)(${{[0-9]+}})
+
+; MIPS64: sd $2
+
+; MIPS64R5: sd $2
+
+  %0 = call <2 x float> @float2_extern(<2 x float> <float 0.0, float -1.0>, <2 x float> <float 12.0, float 14.0>)
+  store <2 x float> %0, <2 x float> * @gv2f32
+  ret void
+}
+
+define void @callfloat_4() {
+entry:
+; ALL-LABEL: callfloat_4:
+
+; MIPS32: sw ${{[0-9]+}}, 36($sp)
+; MIPS32: sw ${{[0-9]+}}, 32($sp)
+; MIPS32: sw ${{[0-9]+}}, 28($sp)
+; MIPS32: sw ${{[0-9]+}}, 24($sp)
+; MIPS32: sw ${{[0-9]+}}, 20($sp)
+; MIPS32: sw ${{[0-9]+}}, 16($sp)
+; MIPS32: addiu $4, $sp, 48
+; MIPS32: addiu $6, $zero, 0
+; MIPS32: lui $7
+
+; MIPS32R5: copy_s.w $6, $w{{[0-9]+}}
+; MIPS32R5: copy_s.w $7, $w{{[0-9]+}}
+; MIPS32R5: sw ${{[0-9]+}}, 36($sp)
+; MIPS32R5: sw ${{[0-9]+}}, 32($sp)
+; MIPS32R5: sw ${{[0-9]+}}, 28($sp)
+; MIPS32R5: sw ${{[0-9]+}}, 24($sp)
+; MIPS32R5: sw ${{[0-9]+}}, 20($sp)
+; MIPS32R5: sw ${{[0-9]+}}, 16($sp)
+; MIPS32R5: addiu $4, $sp, 48
+
+; MIPS64-DAG: dsll $4
+; MIPS64-DAG: dsll $5
+; MIPS64-DAG: dsll $6
+; MIPS64-DAG: dsll $7
+
+; MIPS64R5-DAG: copy_s.d $4, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $5, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $6, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $7, $w{{[0-9]+}}
+
+; MIPS64: jalr $25
+; MIPS32: jal
+
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 48($sp)
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 52($sp)
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 56($sp)
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 60($sp)
+
+; MIPS32R5: ld.w $w{{[0-9]+}}, 48($sp)
+
+; MIPS64-DAG: sd $2
+; MIPS64-DAG: sd $3
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W0]][1], $3
+
+  %0 = call <4 x float> @float4_extern(<4 x float> <float 0.0, float -1.0, float 2.0, float 4.0>, <4 x float> <float 12.0, float 14.0, float 15.0, float 16.0>)
+  store <4 x float> %0, <4 x float> * @gv4f32
+  ret void
+}
+
+define void @calldouble_2() {
+entry:
+; ALL-LABEL: calldouble_2:
+
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 36($sp)
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 32($sp)
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 28($sp)
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 24($sp)
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 20($sp)
+; MIPS32-DAG: sw ${{[0-9a-z]+}}, 16($sp)
+
+; MIPS32-DAG: addiu $4, $sp, [[R0:[0-9]+]]
+; MIPS32-DAG: addiu $6, $zero, 0
+; MIPS32-DAG: addiu $7, $zero, 0
+
+; MIPS32R5-DAG: copy_s.w $4, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $5, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $6, $w{{[0-9]+}}
+; MIPS32R5-DAG: copy_s.w $7, $w{{[0-9]+}}
+
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 36($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 32($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 28($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 24($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 20($sp)
+; MIPS32R5-DAG: sw  ${{[0-9]+}}, 16($sp)
+
+; MIPS64-DAG: dsll $5
+; MIPS64-DAG: dsll $6
+; MIPS64-DAG: dsll $7
+; MIPS64-DAG: daddiu $4
+
+; MIPS64R5-DAG: copy_s.d $4, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $5, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $6, $w{{[0-9]+}}
+; MIPS64R5-DAG: copy_s.d $7, $w{{[0-9]+}}
+
+; MIPS32: jal double2_extern
+; MIPS64: jalr $25
+
+; MIPS32-DAG: ldc1 $f[[F0:[0-9]+]], 48($sp)
+; MIPS32-DAG: ldc1 $f[[F1:[0-9]+]], 56($sp)
+
+; MIPS32-DAG: sdc1 $f[[F1]], 8(${{[0-9]+}})
+; MIPS32-DAG: sdc1 $f[[F0]], %lo(gv2f64)(${{[0-9]+}})
+
+; MIPS32R5: ld.d $w[[W0:[0-9]+]], 48($sp)
+; MIPS32R5: st.d $w[[W0]], 0(${{[0-9]+}})
+
+; MIPS64-DAG: sd $2
+; MIPS64-DAG: sd $3
+
+; MIPS64R5-DAG: insert.d $w[[W0:[0-9]+]][0], $2
+; MIPS64R5-DAG: insert.d $w[[W0]][1], $3
+
+  %0 = call <2 x double> @double2_extern(<2 x double> <double 0.0, double -1.0>, <2 x double> <double 12.0, double 14.0>)
+  store <2 x double> %0, <2 x double> * @gv2f64
+  ret void
+}
+
+; The mixed tests show that due to alignment requirements, $5 is not used
+; in argument passing.
+
+define float @mixed_i8(<2 x float> %a, i8 %b, <2 x float> %c) {
+entry:
+; ALL-LABEL: mixed_i8:
+
+; MIPS32-DAG: mtc1 $5, $f{{[0-9]+}}
+; MIPS32: andi $[[R7:[0-9]+]], $6, 255
+; MIPS32: mtc1 $[[R7]], $f[[F0:[0-9]+]]
+; MIPS32: cvt.s.w $f{{[0-9]+}}, $f[[F0]]
+
+; MIPS32-DAG: mtc1 $4, $f{{[0-9]+}}
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 16($sp)
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 20($sp)
+; MIPS32-DAG: add.s $f0, $f{{[0-9]+}}, $f{{[0-9]+}}
+
+; MIPS32R5: andi $[[R0:[0-9]+]], $6, 255
+; MIPS32R5: sw $[[R0]], {{[0-9]+}}($sp)
+; MIPS32R5: sw $[[R0]], {{[0-9]+}}($sp)
+; MIPS32R5-DAG: sw $5, {{[0-9]+}}($sp)
+; MIPS32R5-DAG: sw $4, {{[0-9]+}}($sp)
+
+; MIPS64EB-DAG: sll $[[R0:[0-9]+]], $4, 0
+; MIPS64EB-DAG: mtc1 $[[R0]], $f{{[0-9]+}}
+; MIPS64EB: sll $[[R6:[0-9]+]], $5, 0
+; MIPS64EB: andi $[[R7:[0-9]+]], $[[R6]], 255
+; MIPS64EB: mtc1 $[[R7]], $f[[F0:[0-9]+]]
+; MIPS64EB: cvt.s.w $f{{[0-9]+}}, $f[[F0]]
+
+; MIPS64EB-DAG: dsrl $[[R1:[0-9]+]], $4, 32
+; MIPS64EB-DAG: sll $[[R2:[0-9]+]], $[[R1]], 0
+; MIPS64EB-DAG: mtc1 $[[R2:[0-9]+]], $f{{[0-9]+}}
+
+; MIPS64EB-DAG: sll $[[R3:[0-9]+]], $6, 0
+; MIPS64EB-DAG: mtc1 $[[R3]], $f{{[0-9]+}}
+; MIPS64EB-DAG: dsrl $[[R4:[0-9]+]], $6, 32
+; MIPS64EB-DAG: sll $[[R5:[0-9]+]], $[[R4]], 0
+; MIPS64EB-DAG: mtc1 $[[R5:[0-9]+]], $f{{[0-9]+}}
+
+; MIPS64EL-DAG: dsrl $[[R1:[0-9]+]], $4, 32
+; MIPS64EL-DAG: sll $[[R2:[0-9]+]], $[[R1]], 0
+; MIPS64EL-DAG: mtc1 $[[R2:[0-9]+]], $f{{[0-9]+}}
+
+; MIPS64EL: sll $[[R6:[0-9]+]], $5, 0
+; MIPS64EL: andi $[[R7:[0-9]+]], $[[R6]], 255
+; MIPS64EL: mtc1 $[[R7]], $f[[F0:[0-9]+]]
+; MIPS64EL: cvt.s.w $f{{[0-9]+}}, $f[[F0]]
+
+; MIPS64EL-DAG: dsrl $[[R4:[0-9]+]], $6, 32
+; MIPS64EL-DAG: sll $[[R5:[0-9]+]], $[[R4]], 0
+; MIPS64EL-DAG: mtc1 $[[R5:[0-9]+]], $f{{[0-9]+}}
+
+; MIPS64EL-DAG: sll $[[R0:[0-9]+]], $4, 0
+; MIPS64EL-DAG: mtc1 $[[R0]], $f{{[0-9]+}}
+; MIPS64EL-DAG: sll $[[R3:[0-9]+]], $6, 0
+; MIPS64EL-DAG: mtc1 $[[R3]], $f{{[0-9]+}}
+
+; MIPS64R5: sll $[[R0:[0-9]+]], $5, 0
+; MIPS64R5: andi $[[R1:[0-9]+]], $[[R0]], 255
+; MIPS64R5: sd $4, {{[0-9]+}}($sp)
+; MIPS64R5: sd $6, {{[0-9]+}}($sp)
+
+  %0 = zext i8 %b to i32
+  %1 = uitofp i32 %0 to float
+  %2 = insertelement <2 x float> undef, float %1, i32 0
+  %3 = insertelement <2 x float> %2, float %1, i32 1
+  %4 = fadd <2 x float> %3, %a
+  %5 = fadd <2 x float> %4, %c
+  %6 = extractelement <2 x float> %5, i32 0
+  %7 = extractelement <2 x float> %5, i32 1
+  %8 = fadd float %6, %7
+  ret float %8
+}
+
+define <4 x float> @mixed_32(<4 x float> %a, i32 %b) {
+entry:
+; ALL-LABEL: mixed_32:
+
+; MIPS32-DAG: mtc1 $6, $f{{[0-9]+}}
+; MIPS32-DAG: mtc1 $7, $f{{[0-9]+}}
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 28($sp)
+; MIPS32-DAG: lwc1 $f{{[0-9]+}}, 24($sp)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 0($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 4($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 8($4)
+; MIPS32-DAG: swc1 $f{{[0-9]+}}, 12($4)
+
+; MIPS32R5: insert.w $w[[W0:[0-9]+]][0], $6
+; MIPS32R5: insert.w $w[[W0]][1], $7
+; MIPS32R5: lw $[[R0:[0-9]+]], 16($sp)
+; MIPS32R5: insert.w $w[[W0]][2], $[[R0]]
+; MIPS32R5: lw $[[R1:[0-9]+]], 20($sp)
+; MIPS32R5: insert.w $w[[W0]][3], $[[R1]]
+; MIPS32R5: lw $[[R0:[0-9]+]], 24($sp)
+
+; MIPS64-DAG: sll ${{[0-9]+}}, $6, 0
+; MIPS64-DAG: dsrl $[[R0:[0-9]+]], $4, 32
+; MIPS64-DAG: sll $[[R1:[0-9]+]], $[[R0]], 0
+; MIPS64-DAG: mtc1 $[[R1]], $f{{[0-9]+}}
+; MIPS64-DAG: sll $[[R2:[0-9]+]], $4, 0
+; MIPS64-DAG: dsrl $[[R3:[0-9]+]], $5, 32
+; MIPS64-DAG: sll $[[R4:[0-9]+]], $[[R3]], 0
+; MIPS64-DAG: mtc1 $[[R4]], $f{{[0-9]+}}
+; MIPS64-DAG: mtc1 $[[R2]], $f{{[0-9]+}}
+; MIPS64-DAG: sll	$[[R6:[0-9]+]], $5, 0
+; MIPS64-DAG: mtc1 $[[R6:[0-9]+]], $f{{[0-9]+}}
+
+; MIPS64R5: insert.d $w[[W0:[0-9]+]][0], $4
+; MIPS64R5: insert.d $w[[W0]][1], $5
+; MIPS64R5: sll $[[R0:[0-9]+]], $6, 0
+; MIPS64R5: fill.w $w{{[0-9]+}}, $[[R0]]
+
+  %0 = uitofp i32 %b to float
+  %1 = insertelement <4 x float> undef, float %0, i32 0
+  %2 = insertelement <4 x float> %1, float %0, i32 1
+  %3 = insertelement <4 x float> %2, float %0, i32 2
+  %4 = insertelement <4 x float> %3, float %0, i32 3
+  %5 = fadd <4 x float> %4, %a
+  ret <4 x float> %5
+}
+
+
+; This test is slightly more fragile than I'd like as the offset into the
+; outgoing arguments area is dependent on the size of the stack frame for
+; this function.
+
+define <4 x float> @cast(<4 x i32> %a) {
+entry:
+; ALL-LABEL: cast:
+
+; MIPS32: addiu $sp, $sp, -32
+; MIPS32-DAG: sw $6, {{[0-9]+}}($sp)
+; MIPS32-DAG: sw $7, {{[0-9]+}}($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 48($sp)
+; MIPS32-DAG: lw ${{[0-9]+}}, 52($sp)
+
+; MIPS32R5-DAG: insert.w  $w0[0], $6
+; MIPS32R5-DAG: insert.w  $w0[1], $7
+; MIPS32R5-DAG: lw  $[[R0:[0-9]+]], 16($sp)
+; MIPS32R5-DAG: insert.w  $w0[2], $[[R0]]
+; MIPS32R5-DAG: lw  $[[R1:[0-9]+]], 20($sp)
+; MIPS32R5-DAG: insert.w  $w0[3], $[[R1]]
+
+; MIPS64-DAG: sll ${{[0-9]+}}, $4, 0
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $4, 32
+; MIPS64-DAG: sll ${{[0-9]+}}, $5, 0
+; MIPS64-DAG: dsrl ${{[0-9]+}}, $5, 32
+
+; MIPS64R5-DAG: insert.d  $w0[0], $4
+; MIPS64R5-DAG: insert.d  $w0[1], $5
+
+  %0 = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %0
+}
+
+define <4 x float> @select(<4 x i32> %cond, <4 x float> %arg1, <4 x float> %arg2) {
+entry:
+; ALL-LABEL: select:
+
+; MIPS32-DAG: andi ${{[0-9]+}}, $7, 1
+; MIPS32-DAG: andi ${{[0-9]+}}, $6, 1
+; MIPS32-DAG: lw $[[R0:[0-9]+]], 16($sp)
+; MIPS32-DAG: andi ${{[0-9]+}}, $[[R0]], 1
+; MIPS32-DAG: lw $[[R1:[0-9]+]], 20($sp)
+; MIPS32-DAG: andi ${{[0-9]+}}, $[[R1]], 1
+
+; MIPS32R5-DAG: insert.w $w[[W0:[0-9]+]][0], $6
+; MIPS32R5-DAG: insert.w $w[[W0]][1], $7
+; MIPS32R5-DAG: lw $[[R0:[0-9]+]], 16($sp)
+; MIPS32R5-DAG: lw $[[R1:[0-9]+]], 20($sp)
+; MIPS32R5-DAG: insert.w $w[[W0]][2], $[[R0]]
+; MIPS32R5-DAG: insert.w $w[[W0]][3], $[[R1]]
+; MIPS32R5-DAG: slli.w $w{{[0-9]}}, $w[[W0]]
+
+; MIPS64-DAG: sll $[[R0:[0-9]+]], $6, 0
+; MIPS64-DAG: mtc1 $[[R0]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R1:[0-9]+]], $6, 32
+; MIPS64-DAG: sll $[[R2:[0-9]+]], $[[R1]], 0
+; MIPS64-DAG: mtc1 $[[R2]], $f{{[0-9]+}}
+
+; MIPS64-DAG: sll $[[R3:[0-9]+]], $7, 0
+; MIPS64-DAG: mtc1 $[[R3]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R4:[0-9]+]], $7, 32
+; MIPS64-DAG: sll $[[R5:[0-9]+]], $[[R4]], 0
+; MIPS64-DAG: mtc1 $[[R5]], $f{{[0-9]+}}
+
+; MIPS64-DAG: sll $[[R6:[0-9]+]], $8, 0
+; MIPS64-DAG: mtc1 $[[R6]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R7:[0-9]+]], $8, 32
+; MIPS64-DAG: sll $[[R8:[0-9]+]], $[[R7]], 0
+; MIPS64-DAG: mtc1 $[[R8]], $f{{[0-9]+}}
+
+; MIPS64-DAG: sll $[[R9:[0-9]+]], $9, 0
+; MIPS64-DAG: mtc1 $[[R9]], $f{{[0-9]+}}
+; MIPS64-DAG: dsrl $[[R10:[0-9]+]], $9, 32
+; MIPS64-DAG: sll $[[R11:[0-9]+]], $[[R10]], 0
+; MIPS64-DAG: mtc1 $[[R11]], $f{{[0-9]+}}
+
+; MIPS64-DAG: sll $[[R12:[0-9]+]], $4, 0
+; MIPS64-DAG: andi ${{[0-9]+}}, $[[R12]], 1
+; MIPS64-DAG: dsrl $[[R13:[0-9]+]], $4, 32
+; MIPS64-DAG: sll $[[R14:[0-9]+]], $[[R13]], 0
+; MIPS64-DAG: andi ${{[0-9]+}}, $[[R14]], 1
+
+; MIPS64-DAG: sll $[[R15:[0-9]+]], $5, 0
+; MIPS64-DAG: andi ${{[0-9]+}}, $[[R15]], 1
+; MIPS64-DAG: dsrl $[[R16:[0-9]+]], $5, 32
+; MIPS64-DAG: sll $[[R17:[0-9]+]], $[[R16]], 0
+; MIPS64-DAG: andi ${{[0-9]+}}, $[[R17]], 1
+
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[0], $8
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[1], $9
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[0], $6
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[1], $7
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[0], $4
+; MIPS64R5-DAG: insert.d $w{{[0-9]+}}[1], $5
+
+  %cond.t = trunc <4 x i32> %cond to <4 x i1>
+  %res = select <4 x i1> %cond.t, <4 x float> %arg1, <4 x float> %arg2
+  ret <4 x float> %res
+}
diff --git a/test/CodeGen/Mips/ctlz-v.ll b/test/CodeGen/Mips/ctlz-v.ll
index 3d580e5771f..156c640681b 100644
--- a/test/CodeGen/Mips/ctlz-v.ll
+++ b/test/CodeGen/Mips/ctlz-v.ll
@@ -8,10 +8,14 @@ entry:
 ; MIPS32: clz     $2, $4
 ; MIPS32: clz     $3, $5
 
-; MIPS64-DAG: sll $[[A0:[0-9]+]], $4, 0
-; MIPS64-DAG: clz $2, $[[A0]]
-; MIPS64-DAG: sll $[[A1:[0-9]+]], $5, 0
-; MIPS64-DAG: clz $3, $[[A1]]
+; MIPS64-DAG: dsrl $[[A0:[0-9]+]], $4, 32
+; MIPS64-DAG: sll $[[A1:[0-9]+]], $[[A0]], 0
+; MIPS64-DAG: clz $[[R0:[0-9]+]], $[[A1]]
+; MIPS64-DAG: dsll $[[R1:[0-9]+]], $[[R0]], 32
+; MIPS64-DAG: sll $[[A2:[0-9]+]], $4, 0
+; MIPS64-DAG: clz $[[R2:[0-9]+]], $[[A2]]
+; MIPS64-DAG: dext $[[R3:[0-9]+]], $[[R2]], 0, 32
+; MIPS64-DAG: or $2, $[[R3]], $[[R1]]
 
   %ret = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %x, i1 true)
   ret <2 x i32> %ret
diff --git a/test/CodeGen/Mips/cttz-v.ll b/test/CodeGen/Mips/cttz-v.ll
index 85f69f9a17d..dbcde7f5fe5 100644
--- a/test/CodeGen/Mips/cttz-v.ll
+++ b/test/CodeGen/Mips/cttz-v.ll
@@ -24,14 +24,17 @@ entry:
 ; MIPS64-DAG: and     $[[R2:[0-9]+]], $[[R1]], $[[R0]]
 ; MIPS64-DAG: clz     $[[R3:[0-9]+]], $[[R2]]
 ; MIPS64-DAG: addiu   $[[R4:[0-9]+]], $zero, 32
-; MIPS64-DAG: subu    $2, $[[R4]], $[[R3]]
-; MIPS64-DAG: sll     $[[A1:[0-9]+]], $5, 0
-; MIPS64-DAG: addiu   $[[R5:[0-9]+]], $[[A1]], -1
-; MIPS64-DAG: not     $[[R6:[0-9]+]], $[[A1]]
-; MIPS64-DAG: and     $[[R7:[0-9]+]], $[[R6]], $[[R5]]
-; MIPS64-DAG: clz     $[[R8:[0-9]+]], $[[R7]]
-; MIPS64-DAG: jr      $ra
-; MIPS64-DAG: subu    $3, $[[R4]], $[[R8]]
+; MIPS64-DAG: subu    $[[R5:[0-9]+]], $[[R4]], $[[R3]]
+; MIPS64-DAG: dsrl    $[[R6:[0-9]+]], $4, 32
+; MIPS64-DAG: sll     $[[R7:[0-9]+]], $[[R6]], 0
+; MIPS64-DAG: dext    $[[R8:[0-9]+]], $[[R5]], 0, 32
+; MIPS64-DAG: addiu   $[[R9:[0-9]+]], $[[R7]], -1
+; MIPS64-DAG: not     $[[R10:[0-9]+]], $[[R7]]
+; MIPS64-DAG: and     $[[R11:[0-9]+]], $[[R10]], $[[R9]]
+; MIPS64-DAG: clz     $[[R12:[0-9]+]], $[[R11]]
+; MIPS64-DAG: subu    $[[R13:[0-9]+]], $[[R4]], $[[R12]]
+; MIPS64-DAG: dsll    $[[R14:[0-9]+]], $[[R13]], 32
+; MIPS64-DAG: or      $2, $[[R8]], $[[R14]]
 
   %ret = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %x, i1 true)
   ret <2 x i32> %ret
diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll
index 08eddf37009..c59695d1873 100644
--- a/test/CodeGen/Mips/return-vector.ll
+++ b/test/CodeGen/Mips/return-vector.ll
@@ -128,8 +128,11 @@ entry:
 
 ; CHECK-LABEL:        call_f2:
 ; CHECK:        call16(f2)
-; CHECK-NOT:    lwc1
-; CHECK:        add.s    $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+; CHECK:        addiu $4, $sp, [[O0:[0-9]+]]
+; CHECK-DAG:    lwc1 $f[[F0:[0-9]+]], [[O0]]($sp)
+; CHECK-DAG:    lwc1 $f[[F1:[0-9]+]], 20($sp)
+; CHECK:        add.s    $f0, $f[[F0]], $f[[F1]]
+
 }
 
 
@@ -143,11 +146,12 @@ entry:
 
 ; CHECK-LABEL:        call_d2:
 ; CHECK:        call16(d2)
-; CHECK-NOT:    ldc1
-; CHECK:        add.d    $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
-}
-
+; CHECK:        addiu $4, $sp, [[O0:[0-9]+]]
+; CHECK-DAG:    ldc1 $f[[F0:[0-9]+]], 24($sp)
+; CHECK-DAG:    ldc1 $f[[F1:[0-9]+]], [[O0]]($sp)
+; CHECK:        add.d    $f0, $f[[F1]], $f[[F0]]
 
+}
 
 ; Check that function returns vector on stack in cases when vector can't be
 ; returned in registers. Also check that vector is placed on stack starting
@@ -179,11 +183,12 @@ entry:
   ret <4 x float> %vecins4
 
 ; CHECK-LABEL:        return_f4:
-; CHECK-DAG:    lwc1    $[[R0:[a-z0-9]+]], 16($sp)
-; CHECK-DAG:    swc1    $[[R0]], 12($4)
+; CHECK-DAG:    lwc1    $f[[R0:[0-9]+]], 16($sp)
+; CHECK-DAG:    swc1    $f[[R0]], 12($4)
 ; CHECK-DAG:    sw      $7, 8($4)
 ; CHECK-DAG:    sw      $6, 4($4)
 ; CHECK-DAG:    sw      $5, 0($4)
+
 }
 
 
@@ -227,8 +232,8 @@ entry:
   ret <2 x float> %vecins2
 
 ; CHECK-LABEL:        return_f2:
-; CHECK:        mov.s   $f0, $f12
-; CHECK:        mov.s   $f2, $f14
+; CHECK-DAG:    sw   $5, 0($4)
+; CHECK-DAG:    sw   $6, 4($4)
 }
 
 
@@ -239,6 +244,10 @@ entry:
   ret <2 x double> %vecins2
 
 ; CHECK-LABEL:        return_d2:
-; CHECK:        mov.d   $f0, $f12
-; CHECK:        mov.d   $f2, $f14
+; CHECK-DAG:    ldc1 $f[[F0:[0-9]+]], 16($sp)
+; CHECK-DAG:    sdc1 $f[[F0]], 8($4)
+; CHECK-DAG:    mtc1 $6, $f[[F1:[0-9]+]]
+; CHECK-DAG:    mtc1 $7, $f{{[0-9]+}}
+; CHECK-DAG:    sdc1 $f[[F0]], 0($4)
+
 }
-- 
2.50.1