From: Jiangning Liu <jiangning.liu@arm.com>
Date: Fri, 4 Oct 2013 09:21:17 +0000 (+0000)
Subject: Implement aarch64 neon instruction set AdvSIMD (3V elem).
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0aa1a88e19235574481e46e9e6e9ce66a9e6624f;p=clang

Implement aarch64 neon instruction set AdvSIMD (3V elem).


git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@191945 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index b59843a6ea..0b5508ac65 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -40,16 +40,25 @@ def OP_MLS_N : Op;
 def OP_MLAL_N : Op;
 def OP_MLSL_N : Op;
 def OP_MUL_LN: Op;
+def OP_MULX_LN: Op;
 def OP_MULL_LN : Op;
+def OP_MULLHi_LN : Op;
 def OP_MLA_LN: Op;
 def OP_MLS_LN: Op;
 def OP_MLAL_LN : Op;
+def OP_MLALHi_LN : Op;
 def OP_MLSL_LN : Op;
+def OP_MLSLHi_LN : Op;
 def OP_QDMULL_LN : Op;
+def OP_QDMULLHi_LN : Op;
 def OP_QDMLAL_LN : Op;
+def OP_QDMLALHi_LN : Op;
 def OP_QDMLSL_LN : Op;
+def OP_QDMLSLHi_LN : Op;
 def OP_QDMULH_LN : Op;
 def OP_QRDMULH_LN : Op;
+def OP_FMS_LN : Op;
+def OP_FMS_LNQ : Op;
 def OP_EQ    : Op;
 def OP_GE    : Op;
 def OP_LE    : Op;
@@ -146,6 +155,7 @@ class NoTestOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
 // f: float (int args)
 // d: default
 // g: default, ignore 'Q' size modifier.
+// j: default, force 'Q' size modifier.
 // w: double width elements, same num elts
 // n: double width elements, half num elts
 // h: half width elements, double num elts
@@ -503,7 +513,7 @@ def MLS     : IOpInst<"vmls", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUiQd", OP_MLS>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Multiplication Extended
-def MULX : SInst<"vmulx", "ddd", "fQfQd">;
+def MULX : SInst<"vmulx", "ddd", "fdQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Division
@@ -629,6 +639,63 @@ def VQDMULL_HIGH : SOpInst<"vqdmull_high", "wkk", "si", OP_QDMULLHi>;
 def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>;
 def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>;
 
+////////////////////////////////////////////////////////////////////////////////
+
+def VMLA_LANEQ   : IOpInst<"vmla_laneq", "dddji",
+                           "siUsUifQsQiQUsQUiQf", OP_MLA_LN>;
+def VMLS_LANEQ   : IOpInst<"vmls_laneq", "dddji",
+                           "siUsUifQsQiQUsQUiQf", OP_MLS_LN>;
+
+def VFMA_LANE    : IInst<"vfma_lane", "dddgi", "fdQfQd">;
+def VFMA_LANEQ   : IInst<"vfma_laneq", "dddji", "fdQfQd">;
+def VFMS_LANE    : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>;
+def VFMS_LANEQ   : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LNQ>;
+
+def VMLAL_LANEQ  : SOpInst<"vmlal_laneq", "wwdki", "siUsUi", OP_MLAL_LN>;
+def VMLAL_HIGH_LANE   : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi",
+                                OP_MLALHi_LN>;
+def VMLAL_HIGH_LANEQ  : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi",
+                                OP_MLALHi_LN>;
+def VMLSL_LANEQ  : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LN>;
+def VMLSL_HIGH_LANE   : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi",
+                                OP_MLSLHi_LN>;
+def VMLSL_HIGH_LANEQ  : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi",
+                                OP_MLSLHi_LN>;
+
+def VQDMLAL_LANEQ  : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LN>;
+def VQDMLAL_HIGH_LANE   : SOpInst<"vqdmlal_high_lane", "wwkdi", "si",
+                                OP_QDMLALHi_LN>;
+def VQDMLAL_HIGH_LANEQ  : SOpInst<"vqdmlal_high_laneq", "wwkki", "si",
+                                OP_QDMLALHi_LN>;
+def VQDMLSL_LANEQ  : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LN>;
+def VQDMLSL_HIGH_LANE   : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si",
+                                OP_QDMLSLHi_LN>;
+def VQDMLSL_HIGH_LANEQ  : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si",
+                                OP_QDMLSLHi_LN>;
+
+// Newly add double parameter for vmul_lane in aarch64
+def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>;
+
+def VMUL_LANEQ   : IOpInst<"vmul_laneq", "ddji",
+                           "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>;
+def VMULL_LANEQ  : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>;
+def VMULL_HIGH_LANE   : SOpInst<"vmull_high_lane", "wkdi", "siUsUi",
+                                OP_MULLHi_LN>;
+def VMULL_HIGH_LANEQ  : SOpInst<"vmull_high_laneq", "wkki", "siUsUi",
+                                OP_MULLHi_LN>;
+
+def VQDMULL_LANEQ  : SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LN>;
+def VQDMULL_HIGH_LANE   : SOpInst<"vqdmull_high_lane", "wkdi", "si",
+                                  OP_QDMULLHi_LN>;
+def VQDMULL_HIGH_LANEQ  : SOpInst<"vqdmull_high_laneq", "wkki", "si",
+                                  OP_QDMULLHi_LN>;
+
+def VQDMULH_LANEQ  : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>;
+def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>;
+
+def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>;
+def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LN>;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Arithmetic
 
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index a6911bab34..b4caab2297 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -2222,6 +2222,46 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   }
 
   // AArch64-only builtins
+  case AArch64::BI__builtin_neon_vfma_lane_v:
+  case AArch64::BI__builtin_neon_vfmaq_laneq_v: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+    Ops[2] = Builder.CreateBitCast(Ops[2], Ty);
+    Ops[2] = EmitNeonSplat(Ops[2], cast<ConstantInt>(Ops[3]));
+    return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+  }
+  case AArch64::BI__builtin_neon_vfmaq_lane_v: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+    llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
+    llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
+                                            VTy->getNumElements() / 2);
+    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
+    Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
+                                               cast<ConstantInt>(Ops[3]));
+    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
+
+    return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+  }
+  case AArch64::BI__builtin_neon_vfma_laneq_v: {
+    Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
+    Ops[0] = Builder.CreateBitCast(Ops[0], Ty);
+    Ops[1] = Builder.CreateBitCast(Ops[1], Ty);
+
+    llvm::VectorType *VTy = cast<llvm::VectorType>(Ty);
+    llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(),
+                                            VTy->getNumElements() * 2);
+    Ops[2] = Builder.CreateBitCast(Ops[2], STy);
+    Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(),
+                                               cast<ConstantInt>(Ops[3]));
+    Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane");
+
+    return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]);
+  }
   case AArch64::BI__builtin_neon_vfms_v:
   case AArch64::BI__builtin_neon_vfmsq_v: {
     Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty);
diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c
new file mode 100644
index 0000000000..f34e11a3ce
--- /dev/null
+++ b/test/CodeGen/aarch64-neon-2velem.c
@@ -0,0 +1,802 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN:   -S -O3 -o - %s | FileCheck %s
+
+// Test new aarch64 intrinsics and types
+
+#include <arm_neon.h>
+
+int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmla_lane_s16
+  return vmla_lane_s16(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlaq_lane_s16
+  return vmlaq_lane_s16(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmla_lane_s32
+  return vmla_lane_s32(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlaq_lane_s32
+  return vmlaq_lane_s32(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmla_laneq_s16
+  return vmla_laneq_s16(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlaq_laneq_s16
+  return vmlaq_laneq_s16(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmla_laneq_s32
+  return vmla_laneq_s32(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlaq_laneq_s32
+  return vmlaq_laneq_s32(a, b, v, 1);
+  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmls_lane_s16
+  return vmls_lane_s16(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlsq_lane_s16
+  return vmlsq_lane_s16(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmls_lane_s32
+  return vmls_lane_s32(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlsq_lane_s32
+  return vmlsq_lane_s32(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmls_laneq_s16
+  return vmls_laneq_s16(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlsq_laneq_s16
+  return vmlsq_laneq_s16(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmls_laneq_s32
+  return vmls_laneq_s32(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlsq_laneq_s32
+  return vmlsq_laneq_s32(a, b, v, 1);
+  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
+  // CHECK: test_vmul_lane_s16
+  return vmul_lane_s16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
+  // CHECK: test_vmulq_lane_s16
+  return vmulq_lane_s16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
+  // CHECK: test_vmul_lane_s32
+  return vmul_lane_s32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
+  // CHECK: test_vmulq_lane_s32
+  return vmulq_lane_s32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
+  // CHECK: test_vmul_lane_u16
+  return vmul_lane_u16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
+  // CHECK: test_vmulq_lane_u16
+  return vmulq_lane_u16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
+  // CHECK: test_vmul_lane_u32
+  return vmul_lane_u32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
+  // CHECK: test_vmulq_lane_u32
+  return vmulq_lane_u32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
+  // CHECK: test_vmul_laneq_s16
+  return vmul_laneq_s16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
+  // CHECK: test_vmulq_laneq_s16
+  return vmulq_laneq_s16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
+  // CHECK: test_vmul_laneq_s32
+  return vmul_laneq_s32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
+  // CHECK: test_vmulq_laneq_s32
+  return vmulq_laneq_s32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
+  // CHECK: test_vmul_laneq_u16
+  return vmul_laneq_u16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
+  // CHECK: test_vmulq_laneq_u16
+  return vmulq_laneq_u16(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
+  // CHECK: test_vmul_laneq_u32
+  return vmul_laneq_u32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
+  // CHECK: test_vmulq_laneq_u32
+  return vmulq_laneq_u32(a, v, 1);
+  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  // CHECK: test_vfma_lane_f32
+  return vfma_lane_f32(a, b, v, 1);
+  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  // CHECK: test_vfmaq_lane_f32
+  return vfmaq_lane_f32(a, b, v, 1);
+  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  // CHECK: test_vfma_laneq_f32
+  return vfma_laneq_f32(a, b, v, 1);
+  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  // CHECK: test_vfmaq_laneq_f32
+  return vfmaq_laneq_f32(a, b, v, 1);
+  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
+  // CHECK: test_vfms_lane_f32
+  return vfms_lane_f32(a, b, v, 1);
+  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
+  // CHECK: test_vfmsq_lane_f32
+  return vfmsq_lane_f32(a, b, v, 1);
+  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
+  // CHECK: test_vfms_laneq_f32
+  return vfms_laneq_f32(a, b, v, 1);
+  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
+  // CHECK: test_vfmsq_laneq_f32
+  return vfmsq_laneq_f32(a, b, v, 1);
+  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  // CHECK: test_vfmaq_lane_f64
+  return vfmaq_lane_f64(a, b, v, 0);
+  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmaq_laneq_f64
+  return vfmaq_laneq_f64(a, b, v, 0);
+  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmaq_laneq_f64
+  return vfmaq_laneq_f64(a, b, v, 1);
+  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
+  // CHECK: test_vfmsq_lane_f64
+  return vfmsq_lane_f64(a, b, v, 0);
+  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmsq_laneq_f64
+  return vfmsq_laneq_f64(a, b, v, 0);
+  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
+  // CHECK: test_vfmsq_laneq_f64
+  return vfmsq_laneq_f64(a, b, v, 1);
+  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmlal_lane_s16
+  return vmlal_lane_s16(a, b, v, 1);
+  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmlal_lane_s32
+  return vmlal_lane_s32(a, b, v, 1);
+  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmlal_laneq_s16
+  return vmlal_laneq_s16(a, b, v, 1);
+  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmlal_laneq_s32
+  return vmlal_laneq_s32(a, b, v, 1);
+  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlal_high_lane_s16
+  return vmlal_high_lane_s16(a, b, v, 1);
+  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlal_high_lane_s32
+  return vmlal_high_lane_s32(a, b, v, 1);
+  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlal_high_laneq_s16
+  return vmlal_high_laneq_s16(a, b, v, 1);
+  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlal_high_laneq_s32
+  return vmlal_high_laneq_s32(a, b, v, 1);
+  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmlsl_lane_s16
+  return vmlsl_lane_s16(a, b, v, 1);
+  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmlsl_lane_s32
+  return vmlsl_lane_s32(a, b, v, 1);
+  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmlsl_laneq_s16
+  return vmlsl_laneq_s16(a, b, v, 1);
+  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmlsl_laneq_s32
+  return vmlsl_laneq_s32(a, b, v, 1);
+  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlsl_high_lane_s16
+  return vmlsl_high_lane_s16(a, b, v, 1);
+  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlsl_high_lane_s32
+  return vmlsl_high_lane_s32(a, b, v, 1);
+  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlsl_high_laneq_s16
+  return vmlsl_high_laneq_s16(a, b, v, 1);
+  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlsl_high_laneq_s32
+  return vmlsl_high_laneq_s32(a, b, v, 1);
+  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmlal_lane_u16
+  return vmlal_lane_u16(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmlal_lane_u32
+  return vmlal_lane_u32(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmlal_laneq_u16
+  return vmlal_laneq_u16(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmlal_laneq_u32
+  return vmlal_laneq_u32(a, b, v, 1);
+  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlal_high_lane_u16
+  return vmlal_high_lane_u16(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlal_high_lane_u32
+  return vmlal_high_lane_u32(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlal_high_laneq_u16
+  return vmlal_high_laneq_u16(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlal_high_laneq_u32
+  return vmlal_high_laneq_u32(a, b, v, 1);
+  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vmlsl_lane_u16
+  return vmlsl_lane_u16(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vmlsl_lane_u32
+  return vmlsl_lane_u32(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
+  // CHECK: test_vmlsl_laneq_u16
+  return vmlsl_laneq_u16(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
+  // CHECK: test_vmlsl_laneq_u32
+  return vmlsl_laneq_u32(a, b, v, 1);
+  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vmlsl_high_lane_u16
+  return vmlsl_high_lane_u16(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vmlsl_high_lane_u32
+  return vmlsl_high_lane_u32(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
+  // CHECK: test_vmlsl_high_laneq_u16
+  return vmlsl_high_laneq_u16(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
+  // CHECK: test_vmlsl_high_laneq_u32
+  return vmlsl_high_laneq_u32(a, b, v, 1);
+  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
+  // CHECK: test_vmull_lane_s16
+  return vmull_lane_s16(a, v, 1);
+  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
+  // CHECK: test_vmull_lane_s32
+  return vmull_lane_s32(a, v, 1);
+  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
+  // CHECK: test_vmull_lane_u16
+  return vmull_lane_u16(a, v, 1);
+  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
+  // CHECK: test_vmull_lane_u32
+  return vmull_lane_u32(a, v, 1);
+  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
+  // CHECK: test_vmull_high_lane_s16
+  return vmull_high_lane_s16(a, v, 1);
+  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
+  // CHECK: test_vmull_high_lane_s32
+  return vmull_high_lane_s32(a, v, 1);
+  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
+  // CHECK: test_vmull_high_lane_u16
+  return vmull_high_lane_u16(a, v, 1);
+  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
+  // CHECK: test_vmull_high_lane_u32
+  return vmull_high_lane_u32(a, v, 1);
+  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
+  // CHECK: test_vmull_laneq_s16
+  return vmull_laneq_s16(a, v, 1);
+  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
+  // CHECK: test_vmull_laneq_s32
+  return vmull_laneq_s32(a, v, 1);
+  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
+  // CHECK: test_vmull_laneq_u16
+  return vmull_laneq_u16(a, v, 1);
+  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
+  // CHECK: test_vmull_laneq_u32
+  return vmull_laneq_u32(a, v, 1);
+  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
+  // CHECK: test_vmull_high_laneq_s16
+  return vmull_high_laneq_s16(a, v, 1);
+  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
+  // CHECK: test_vmull_high_laneq_s32
+  return vmull_high_laneq_s32(a, v, 1);
+  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
+  // CHECK: test_vmull_high_laneq_u16
+  return vmull_high_laneq_u16(a, v, 1);
+  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
+  // CHECK: test_vmull_high_laneq_u32
+  return vmull_high_laneq_u32(a, v, 1);
+  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vqdmlal_lane_s16
+  return vqdmlal_lane_s16(a, b, v, 1);
+  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vqdmlal_lane_s32
+  return vqdmlal_lane_s32(a, b, v, 1);
+  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vqdmlal_high_lane_s16
+  return vqdmlal_high_lane_s16(a, b, v, 1);
+  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vqdmlal_high_lane_s32
+  return vqdmlal_high_lane_s32(a, b, v, 1);
+  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
+  // CHECK: test_vqdmlsl_lane_s16
+  return vqdmlsl_lane_s16(a, b, v, 1);
+  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
+  // CHECK: test_vqdmlsl_lane_s32
+  return vqdmlsl_lane_s32(a, b, v, 1);
+  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
+  // CHECK: test_vqdmlsl_high_lane_s16
+  return vqdmlsl_high_lane_s16(a, b, v, 1);
+  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
+  // CHECK: test_vqdmlsl_high_lane_s32
+  return vqdmlsl_high_lane_s32(a, b, v, 1);
+  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
+  // CHECK: test_vqdmull_lane_s16
+  return vqdmull_lane_s16(a, v, 1);
+  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
+  // CHECK: test_vqdmull_lane_s32
+  return vqdmull_lane_s32(a, v, 1);
+  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
+  // CHECK: test_vqdmull_laneq_s16
+  return vqdmull_laneq_s16(a, v, 1);
+  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
+  // CHECK: test_vqdmull_laneq_s32
+  return vqdmull_laneq_s32(a, v, 1);
+  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
+  // CHECK: test_vqdmull_high_lane_s16
+  return vqdmull_high_lane_s16(a, v, 1);
+  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
+  // CHECK: test_vqdmull_high_lane_s32
+  return vqdmull_high_lane_s32(a, v, 1);
+  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
+  // CHECK: test_vqdmull_high_laneq_s16
+  return vqdmull_high_laneq_s16(a, v, 1);
+  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
+  // CHECK: test_vqdmull_high_laneq_s32
+  return vqdmull_high_laneq_s32(a, v, 1);
+  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
+  // CHECK: test_vqdmulh_lane_s16
+  return vqdmulh_lane_s16(a, v, 1);
+  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
+  // CHECK: test_vqdmulhq_lane_s16
+  return vqdmulhq_lane_s16(a, v, 1);
+  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
+  // CHECK: test_vqdmulh_lane_s32
+  return vqdmulh_lane_s32(a, v, 1);
+  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
+  // CHECK: test_vqdmulhq_lane_s32
+  return vqdmulhq_lane_s32(a, v, 1);
+  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
+  // CHECK: test_vqrdmulh_lane_s16
+  return vqrdmulh_lane_s16(a, v, 1);
+  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1]
+}
+
+int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
+  // CHECK: test_vqrdmulhq_lane_s16
+  return vqrdmulhq_lane_s16(a, v, 1);
+  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1]
+}
+
+int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
+  // CHECK: test_vqrdmulh_lane_s32
+  return vqrdmulh_lane_s32(a, v, 1);
+  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
+  // CHECK: test_vqrdmulhq_lane_s32
+  return vqrdmulhq_lane_s32(a, v, 1);
+  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
+  // CHECK: test_vmul_lane_f32
+  return vmul_lane_f32(a, v, 1);
+  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
+  // CHECK: test_vmulq_lane_f32
+  return vmulq_lane_f32(a, v, 1);
+  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
+  // CHECK: test_vmulq_lane_f64
+  return vmulq_lane_f64(a, v, 0);
+  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
+  // CHECK: test_vmul_laneq_f32
+  return vmul_laneq_f32(a, v, 1);
+  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
+  // CHECK: test_vmulq_laneq_f32
+  return vmulq_laneq_f32(a, v, 1);
+  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulq_laneq_f64
+  return vmulq_laneq_f64(a, v, 0);
+  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulq_laneq_f64
+  return vmulq_laneq_f64(a, v, 1);
+  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
+float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
+  // CHECK: test_vmulx_lane_f32
+  return vmulx_lane_f32(a, v, 1);
+  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
+  // CHECK: test_vmulxq_lane_f32
+  return vmulxq_lane_f32(a, v, 1);
+  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
+  // CHECK: test_vmulxq_lane_f64
+  return vmulxq_lane_f64(a, v, 0);
+  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
+  // CHECK: test_vmulx_laneq_f32
+  return vmulx_laneq_f32(a, v, 1);
+  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+}
+
+float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
+  // CHECK: test_vmulxq_laneq_f32
+  return vmulxq_laneq_f32(a, v, 1);
+  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+}
+
+float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulxq_laneq_f64
+  return vmulxq_laneq_f64(a, v, 0);
+  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+}
+
+float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
+  // CHECK: test_vmulxq_laneq_f64
+  return vmulxq_laneq_f64(a, v, 1);
+  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+}
+
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index f700c6753a..9dc2d56f6c 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -62,16 +62,25 @@ enum OpKind {
   OpMlalN,
   OpMlslN,
   OpMulLane,
+  OpMulXLane,
   OpMullLane,
+  OpMullHiLane,
   OpMlaLane,
   OpMlsLane,
   OpMlalLane,
+  OpMlalHiLane,
   OpMlslLane,
+  OpMlslHiLane,
   OpQDMullLane,
+  OpQDMullHiLane,
   OpQDMlalLane,
+  OpQDMlalHiLane,
   OpQDMlslLane,
+  OpQDMlslHiLane,
   OpQDMulhLane,
   OpQRDMulhLane,
+  OpFMSLane,
+  OpFMSLaneQ,
   OpEq,
   OpGe,
   OpLe,
@@ -197,16 +206,25 @@ public:
     OpMap["OP_MLAL_N"] = OpMlalN;
     OpMap["OP_MLSL_N"] = OpMlslN;
     OpMap["OP_MUL_LN"]= OpMulLane;
+    OpMap["OP_MULX_LN"]= OpMulXLane;
     OpMap["OP_MULL_LN"] = OpMullLane;
+    OpMap["OP_MULLHi_LN"] = OpMullHiLane;
     OpMap["OP_MLA_LN"]= OpMlaLane;
     OpMap["OP_MLS_LN"]= OpMlsLane;
     OpMap["OP_MLAL_LN"] = OpMlalLane;
+    OpMap["OP_MLALHi_LN"] = OpMlalHiLane;
     OpMap["OP_MLSL_LN"] = OpMlslLane;
+    OpMap["OP_MLSLHi_LN"] = OpMlslHiLane;
     OpMap["OP_QDMULL_LN"] = OpQDMullLane;
+    OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane;
     OpMap["OP_QDMLAL_LN"] = OpQDMlalLane;
+    OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane;
     OpMap["OP_QDMLSL_LN"] = OpQDMlslLane;
+    OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane;
     OpMap["OP_QDMULH_LN"] = OpQDMulhLane;
     OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane;
+    OpMap["OP_FMS_LN"] = OpFMSLane;
+    OpMap["OP_FMS_LNQ"] = OpFMSLaneQ;
     OpMap["OP_EQ"]    = OpEq;
     OpMap["OP_GE"]    = OpGe;
     OpMap["OP_LE"]    = OpLe;
@@ -447,6 +465,9 @@ static char ModType(const char mod, char type, bool &quad, bool &poly,
     case 'g':
       quad = false;
       break;
+    case 'j':
+      quad = true;
+      break;
     case 'w':
       type = Widen(type);
       quad = true;
@@ -626,7 +647,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
     type = 's';
     usgn = true;
   }
-  usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f');
+  usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
+                         scal && type != 'f' && type != 'd');
 
   if (scal) {
     SmallString<128> s;
@@ -657,6 +679,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
       return "vv*"; // void result with void* first argument
     if (mod == 'f' || (ck != ClassB && type == 'f'))
       return quad ? "V4f" : "V2f";
+    if (ck != ClassB && type == 'd')
+      return quad ? "V2d" : "V1d";
     if (ck != ClassB && type == 's')
       return quad ? "V8s" : "V4s";
     if (ck != ClassB && type == 'i')
@@ -677,6 +701,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
 
   if (mod == 'f' || (ck != ClassB && type == 'f'))
     return quad ? "V4f" : "V2f";
+  if (ck != ClassB && type == 'd')
+    return quad ? "V2d" : "V1d";
   if (ck != ClassB && type == 's')
     return quad ? "V8s" : "V4s";
   if (ck != ClassB && type == 'i')
@@ -974,6 +1000,7 @@ static void NormalizeProtoForRegisterPatternCreation(const std::string &Name,
       NormedProto += 'q';
       break;
     case 'g':
+    case 'j':
     case 'h':
     case 'e':
       NormedProto += 'd';
@@ -1504,6 +1531,10 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpMulLane:
     s += "__a * " + SplatLane(nElts, "__b", "__c") + ";";
     break;
+  case OpMulXLane:
+    s += MangleName("vmulx", typestr, ClassS) + "(__a, " +
+      SplatLane(nElts, "__b", "__c") + ");";
+    break;
   case OpMul:
     s += "__a * __b;";
     break;
@@ -1511,6 +1542,10 @@ static std::string GenOpString(const std::string &name, OpKind op,
     s += MangleName("vmull", typestr, ClassS) + "(__a, " +
       SplatLane(nElts, "__b", "__c") + ");";
     break;
+  case OpMullHiLane:
+    s += MangleName("vmull", typestr, ClassS) + "(" +
+      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
+    break;
   case OpMlaN:
     s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");";
     break;
@@ -1528,6 +1563,10 @@ static std::string GenOpString(const std::string &name, OpKind op,
     s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
       SplatLane(nElts, "__c", "__d") + ");";
     break;
+  case OpMlalHiLane:
+    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" +
+      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+    break;
   case OpMlal:
     s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
     break;
@@ -1543,6 +1582,18 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpMlsLane:
     s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");";
     break;
+  case OpFMSLane:
+    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
+    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
+    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
+    s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
+    break;
+  case OpFMSLaneQ:
+    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
+    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
+    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
+    s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
+    break;
   case OpMls:
     s += "__a - (__b * __c);";
     break;
@@ -1554,6 +1605,10 @@ static std::string GenOpString(const std::string &name, OpKind op,
     s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
       SplatLane(nElts, "__c", "__d") + ");";
     break;
+  case OpMlslHiLane:
+    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" +
+      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+    break;
   case OpMlsl:
     s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
     break;
@@ -1564,14 +1619,26 @@ static std::string GenOpString(const std::string &name, OpKind op,
     s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
       SplatLane(nElts, "__b", "__c") + ");";
     break;
+  case OpQDMullHiLane:
+    s += MangleName("vqdmull", typestr, ClassS) + "(" +
+      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
+    break;
   case OpQDMlalLane:
     s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " +
       SplatLane(nElts, "__c", "__d") + ");";
     break;
+  case OpQDMlalHiLane:
+    s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " +
+      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+    break;
   case OpQDMlslLane:
     s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " +
       SplatLane(nElts, "__c", "__d") + ");";
     break;
+  case OpQDMlslHiLane:
+    s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " +
+      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
+    break;
   case OpQDMulhLane:
     s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
       SplatLane(nElts, "__b", "__c") + ");";
@@ -2072,20 +2139,28 @@ void NeonEmitter::run(raw_ostream &OS) {
 
   // Emit Neon vector typedefs.
   std::string TypedefTypes(
-      "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs");
+      "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPs");
   SmallVector<StringRef, 24> TDTypeVec;
   ParseTypes(0, TypedefTypes, TDTypeVec);
 
   // Emit vector typedefs.
+  bool isA64 = false;
   for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
     bool dummy, quad = false, poly = false;
     char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
-    bool isA64 = false;
+    bool preinsert = false;
+    bool postinsert = false;
 
-    if (type == 'd' && quad)
+    if (type == 'd') {
+      preinsert = isA64? false: true;
       isA64 = true;
-
-    if (isA64)
+    } else {
+      postinsert = isA64? true: false;
+      isA64 = false;
+    }
+    if (postinsert)
+      OS << "#endif\n";
+    if (preinsert)
       OS << "#ifdef __aarch64__\n";
 
     if (poly)
@@ -2101,22 +2176,28 @@ void NeonEmitter::run(raw_ostream &OS) {
     OS << TypeString('s', TDTypeVec[i]);
     OS << " " << TypeString('d', TDTypeVec[i]) << ";\n";
 
-    if (isA64)
-      OS << "#endif\n";
   }
   OS << "\n";
 
   // Emit struct typedefs.
+  isA64 = false;
   for (unsigned vi = 2; vi != 5; ++vi) {
     for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) {
       bool dummy, quad = false, poly = false;
       char type = ClassifyType(TDTypeVec[i], quad, poly, dummy);
-      bool isA64 = false;
+      bool preinsert = false;
+      bool postinsert = false;
 
-      if (type == 'd' && quad)
+      if (type == 'd') {
+        preinsert = isA64? false: true;
         isA64 = true;
-
-      if (isA64)
+      } else {
+        postinsert = isA64? true: false;
+        isA64 = false;
+      }
+      if (postinsert)
+        OS << "#endif\n";
+      if (preinsert)
         OS << "#ifdef __aarch64__\n";
 
       std::string ts = TypeString('d', TDTypeVec[i]);
@@ -2126,10 +2207,6 @@ void NeonEmitter::run(raw_ostream &OS) {
       OS << "[" << utostr(vi) << "]";
       OS << ";\n} ";
       OS << vs << ";\n";
-
-      if (isA64)
-        OS << "#endif\n";
-
       OS << "\n";
     }
   }
@@ -2255,6 +2332,7 @@ static unsigned RangeFromType(const char mod, StringRef typestr) {
     case 'f':
     case 'i':
       return (2 << (int)quad) - 1;
+    case 'd':
     case 'l':
       return (1 << (int)quad) - 1;
     default: