granicus.if.org Git - clang/commitdiff
Implement aarch64 neon instruction set AdvSIMD (3V Diff), covering the following...
author    Jiangning Liu <jiangning.liu@arm.com>
Mon, 9 Sep 2013 02:21:08 +0000 (02:21 +0000)
committer Jiangning Liu <jiangning.liu@arm.com>
Mon, 9 Sep 2013 02:21:08 +0000 (02:21 +0000)
SADDL, UADDL, SADDW, UADDW, SSUBL, USUBL, SSUBW, USUBW, ADDHN, RADDHN, SABAL, UABAL, SUBHN, RSUBHN, SABDL, UABDL, SMLAL, UMLAL, SMLSL, UMLSL, SQDMLAL, SQDMLSL, SMULL, UMULL, SQDMULL, PMULL

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@190289 91177308-0d34-0410-b5e6-96231b3b80d8

include/clang/Basic/arm_neon.td
lib/CodeGen/CGBuiltin.cpp
test/CodeGen/aarch64-neon-intrinsics.c
utils/TableGen/NeonEmitter.cpp
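
The intrinsics added here are the "_high" forms of the widening/narrowing 3V Diff operations: they take their narrow inputs from the upper 64-bit halves of 128-bit vectors, so a full quad-register widening operation can be written as a low-half/high-half pair. As a rough usage illustration (not part of this patch; the helper name is hypothetical), a widening add over all 16 lanes of two int8x16_t values could look like:

  #include <arm_neon.h>
  /* Hypothetical helper: widen and add every lane of two 16-byte vectors. */
  void widen_add_s8(int8x16_t a, int8x16_t b, int16x8_t *lo, int16x8_t *hi) {
    *lo = vaddl_s8(vget_low_s8(a), vget_low_s8(b)); /* low 8 lanes  -> saddl  */
    *hi = vaddl_high_s8(a, b);                      /* high 8 lanes -> saddl2 */
  }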

diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index 1ac5f9b71684f6c7e6e8ce45686776916e6a82ca..ad10abb1c2ba81773dcce01c28903c9cef8ae4b3 100644
@@ -18,15 +18,22 @@ def OP_NONE  : Op;
 def OP_UNAVAILABLE : Op;
 def OP_ADD   : Op;
 def OP_ADDL  : Op;
+def OP_ADDLHi : Op;
 def OP_ADDW  : Op;
+def OP_ADDWHi : Op;
 def OP_SUB   : Op;
 def OP_SUBL  : Op;
+def OP_SUBLHi : Op;
 def OP_SUBW  : Op;
+def OP_SUBWHi : Op;
 def OP_MUL   : Op;
 def OP_MLA   : Op;
 def OP_MLAL  : Op;
+def OP_MULLHi : Op;
+def OP_MLALHi : Op;
 def OP_MLS   : Op;
 def OP_MLSL  : Op;
+def OP_MLSLHi : Op;
 def OP_MUL_N : Op;
 def OP_MLA_N : Op;
 def OP_MLS_N : Op;
@@ -66,9 +73,18 @@ def OP_REV64 : Op;
 def OP_REV32 : Op;
 def OP_REV16 : Op;
 def OP_REINT : Op;
+def OP_ADDHNHi : Op;
+def OP_RADDHNHi : Op;
+def OP_SUBHNHi : Op;
+def OP_RSUBHNHi : Op;
 def OP_ABDL  : Op;
+def OP_ABDLHi : Op;
 def OP_ABA   : Op;
 def OP_ABAL  : Op;
+def OP_ABALHi : Op;
+def OP_QDMULLHi : Op;
+def OP_QDMLALHi : Op;
+def OP_QDMLSLHi : Op;
 def OP_DIV  : Op;
 def OP_LONG_HI : Op;
 def OP_NARROW_HI : Op;
@@ -133,6 +149,7 @@ class NoTestOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
 // w: double width elements, same num elts
 // n: double width elements, half num elts
 // h: half width elements, double num elts
+// q: half width elements, quad num elts
 // e: half width elements, double num elts, unsigned
 // m: half width elements, same num elts
 // i: constant int
@@ -589,6 +606,29 @@ def FCVTZS_N_S64 : SInst<"vcvt_n_s64", "xdi", "Qd">;
 def FCVTZS_N_U64 : SInst<"vcvt_n_u64", "udi", "Qd">;
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// 3VDiff class using the high 64-bit halves of the operands
+def VADDL_HIGH   : SOpInst<"vaddl_high", "wkk", "csiUcUsUi", OP_ADDLHi>;
+def VADDW_HIGH   : SOpInst<"vaddw_high", "wwk", "csiUcUsUi", OP_ADDWHi>;
+def VSUBL_HIGH   : SOpInst<"vsubl_high", "wkk", "csiUcUsUi", OP_SUBLHi>;
+def VSUBW_HIGH   : SOpInst<"vsubw_high", "wwk", "csiUcUsUi", OP_SUBWHi>;
+
+def VABDL_HIGH   : SOpInst<"vabdl_high", "wkk",  "csiUcUsUi", OP_ABDLHi>;
+def VABAL_HIGH   : SOpInst<"vabal_high", "wwkk", "csiUcUsUi", OP_ABALHi>;
+
+def VMULL_HIGH   : SOpInst<"vmull_high", "wkk", "csiUcUsUiPc", OP_MULLHi>;
+def VMLAL_HIGH   : SOpInst<"vmlal_high", "wwkk", "csiUcUsUi", OP_MLALHi>;
+def VMLSL_HIGH   : SOpInst<"vmlsl_high", "wwkk", "csiUcUsUi", OP_MLSLHi>;
+
+def VADDHN_HIGH  : SOpInst<"vaddhn_high", "qhkk", "silUsUiUl", OP_ADDHNHi>;
+def VRADDHN_HIGH : SOpInst<"vraddhn_high", "qhkk", "silUsUiUl", OP_RADDHNHi>;
+def VSUBHN_HIGH  : SOpInst<"vsubhn_high", "qhkk", "silUsUiUl", OP_SUBHNHi>;
+def VRSUBHN_HIGH : SOpInst<"vrsubhn_high", "qhkk", "silUsUiUl", OP_RSUBHNHi>;
+
+def VQDMULL_HIGH : SOpInst<"vqdmull_high", "wkk", "si", OP_QDMULLHi>;
+def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>;
+def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Arithmetic
 
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 6bf5d6f54aff2bdb5a6c6a1717f912de0a705ff4..5b5b39f5e34b482c97fcef9947bcd3ce221c91f3 100644
@@ -1840,6 +1840,22 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshl_v, E);
   case AArch64::BI__builtin_neon_vqrshlq_v:
     return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqrshlq_v, E);
+  case AArch64::BI__builtin_neon_vaddhn_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vaddhn_v, E);
+  case AArch64::BI__builtin_neon_vraddhn_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vraddhn_v, E);
+  case AArch64::BI__builtin_neon_vsubhn_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vsubhn_v, E);
+  case AArch64::BI__builtin_neon_vrsubhn_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vrsubhn_v, E);
+  case AArch64::BI__builtin_neon_vmull_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmull_v, E);
+  case AArch64::BI__builtin_neon_vqdmull_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmull_v, E);
+  case AArch64::BI__builtin_neon_vqdmlal_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlal_v, E);
+  case AArch64::BI__builtin_neon_vqdmlsl_v:
+    return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vqdmlsl_v, E);
   case AArch64::BI__builtin_neon_vmax_v:
     return EmitARMBuiltinExpr(ARM::BI__builtin_neon_vmax_v, E);
   case AArch64::BI__builtin_neon_vmaxq_v:
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index 9dce70d50a97ba1c69c844d8f3c0dc759da11039..53ec130184c8befc75e1fe66f47a4f5db35ea302 100644
@@ -4274,3 +4274,971 @@ uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
   return vcvtq_n_u64_f64(a, 50);
   // CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
+
+int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
+  // CHECK: test_vaddl_s8
+  return vaddl_s8(a, b);
+  // CHECK: saddl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+
+int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
+  // CHECK: test_vaddl_s16
+  return vaddl_s16(a, b);
+  // CHECK: saddl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
+  // CHECK: test_vaddl_s32
+  return vaddl_s32(a, b);
+  // CHECK: saddl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
+  // CHECK: test_vaddl_u8
+  return vaddl_u8(a, b);
+  // CHECK: uaddl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+
+uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
+  // CHECK: test_vaddl_u16
+  return vaddl_u16(a, b);
+  // CHECK: uaddl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
+  // CHECK: test_vaddl_u32
+  return vaddl_u32(a, b);
+  // CHECK: uaddl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
+  // CHECK: test_vaddl_high_s8
+  return vaddl_high_s8(a, b);
+  // CHECK: saddl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+
+int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vaddl_high_s16
+  return vaddl_high_s16(a, b);
+  // CHECK: saddl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vaddl_high_s32
+  return vaddl_high_s32(a, b);
+  // CHECK: saddl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
+  // CHECK: test_vaddl_high_u8
+  return vaddl_high_u8(a, b);
+  // CHECK: uaddl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+
+uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vaddl_high_u16
+  return vaddl_high_u16(a, b);
+  // CHECK: uaddl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vaddl_high_u32
+  return vaddl_high_u32(a, b);
+  // CHECK: uaddl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
+  // CHECK: test_vaddw_s8
+  return vaddw_s8(a, b);
+  // CHECK: saddw {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8b
+}
+
+int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
+  // CHECK: test_vaddw_s16
+  return vaddw_s16(a, b);
+  // CHECK: saddw {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
+  // CHECK: test_vaddw_s32
+  return vaddw_s32(a, b);
+  // CHECK: saddw {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2s
+}
+
+uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
+  // CHECK: test_vaddw_u8
+  return vaddw_u8(a, b);
+  // CHECK: uaddw {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8b
+}
+
+uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
+  // CHECK: test_vaddw_u16
+  return vaddw_u16(a, b);
+  // CHECK: uaddw {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4h
+}
+
+uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
+  // CHECK: test_vaddw_u32
+  return vaddw_u32(a, b);
+  // CHECK: uaddw {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
+  // CHECK: test_vaddw_high_s8
+  return vaddw_high_s8(a, b);
+  // CHECK: saddw2 {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.16b
+}
+
+int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
+  // CHECK: test_vaddw_high_s16
+  return vaddw_high_s16(a, b);
+  // CHECK: saddw2 {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
+  // CHECK: test_vaddw_high_s32
+  return vaddw_high_s32(a, b);
+  // CHECK: saddw2 {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.4s
+}
+
+uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
+  // CHECK: test_vaddw_high_u8
+  return vaddw_high_u8(a, b);
+  // CHECK: uaddw2 {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.16b
+}
+
+uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
+  // CHECK: test_vaddw_high_u16
+  return vaddw_high_u16(a, b);
+  // CHECK: uaddw2 {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.8h
+}
+
+uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
+  // CHECK: test_vaddw_high_u32
+  return vaddw_high_u32(a, b);
+  // CHECK: uaddw2 {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
+  // CHECK: test_vsubl_s8
+  return vsubl_s8(a, b);
+  // CHECK: ssubl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+
+int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
+  // CHECK: test_vsubl_s16
+  return vsubl_s16(a, b);
+  // CHECK: ssubl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
+  // CHECK: test_vsubl_s32
+  return vsubl_s32(a, b);
+  // CHECK: ssubl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
+  // CHECK: test_vsubl_u8
+  return vsubl_u8(a, b);
+  // CHECK: usubl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+
+uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
+  // CHECK: test_vsubl_u16
+  return vsubl_u16(a, b);
+  // CHECK: usubl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
+  // CHECK: test_vsubl_u32
+  return vsubl_u32(a, b);
+  // CHECK: usubl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
+  // CHECK: test_vsubl_high_s8
+  return vsubl_high_s8(a, b);
+  // CHECK: ssubl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+
+int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vsubl_high_s16
+  return vsubl_high_s16(a, b);
+  // CHECK: ssubl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vsubl_high_s32
+  return vsubl_high_s32(a, b);
+  // CHECK: ssubl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
+  // CHECK: test_vsubl_high_u8
+  return vsubl_high_u8(a, b);
+  // CHECK: usubl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+
+uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vsubl_high_u16
+  return vsubl_high_u16(a, b);
+  // CHECK: usubl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vsubl_high_u32
+  return vsubl_high_u32(a, b);
+  // CHECK: usubl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
+  // CHECK: test_vsubw_s8
+  return vsubw_s8(a, b);
+  // CHECK: ssubw {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8b
+}
+
+int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
+  // CHECK: test_vsubw_s16
+  return vsubw_s16(a, b);
+  // CHECK: ssubw {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
+  // CHECK: test_vsubw_s32
+  return vsubw_s32(a, b);
+  // CHECK: ssubw {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2s
+}
+
+uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
+  // CHECK: test_vsubw_u8
+  return vsubw_u8(a, b);
+  // CHECK: usubw {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8b
+}
+
+uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
+  // CHECK: test_vsubw_u16
+  return vsubw_u16(a, b);
+  // CHECK: usubw {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4h
+}
+
+uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
+  // CHECK: test_vsubw_u32
+  return vsubw_u32(a, b);
+  // CHECK: usubw {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) {
+  // CHECK: test_vsubw_high_s8
+  return vsubw_high_s8(a, b);
+  // CHECK: ssubw2 {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.16b
+}
+
+int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
+  // CHECK: test_vsubw_high_s16
+  return vsubw_high_s16(a, b);
+  // CHECK: ssubw2 {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
+  // CHECK: test_vsubw_high_s32
+  return vsubw_high_s32(a, b);
+  // CHECK: ssubw2 {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.4s
+}
+
+uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
+  // CHECK: test_vsubw_high_u8
+  return vsubw_high_u8(a, b);
+  // CHECK: usubw2 {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.16b
+}
+
+uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
+  // CHECK: test_vsubw_high_u16
+  return vsubw_high_u16(a, b);
+  // CHECK: usubw2 {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.8h
+}
+
+uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
+  // CHECK: test_vsubw_high_u32
+  return vsubw_high_u32(a, b);
+  // CHECK: usubw2 {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.4s
+}
+
+int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vaddhn_s16
+  return vaddhn_s16(a, b);
+  // CHECK: addhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vaddhn_s32
+  return vaddhn_s32(a, b);
+  // CHECK: addhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
+  // CHECK: test_vaddhn_s64
+  return vaddhn_s64(a, b);
+  // CHECK: addhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vaddhn_u16
+  return vaddhn_u16(a, b);
+  // CHECK: addhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vaddhn_u32
+  return vaddhn_u32(a, b);
+  // CHECK: addhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vaddhn_u64
+  return vaddhn_u64(a, b);
+  // CHECK: addhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CHECK: test_vaddhn_high_s16
+  return vaddhn_high_s16(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CHECK: test_vaddhn_high_s32
+  return vaddhn_high_s32(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CHECK: test_vaddhn_high_s64
+  return vaddhn_high_s64(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vaddhn_high_u16
+  return vaddhn_high_u16(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vaddhn_high_u32
+  return vaddhn_high_u32(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vaddhn_high_u64
+  return vaddhn_high_u64(r, a, b);
+  // CHECK: addhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vraddhn_s16
+  return vraddhn_s16(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vraddhn_s32
+  return vraddhn_s32(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
+  // CHECK: test_vraddhn_s64
+  return vraddhn_s64(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vraddhn_u16
+  return vraddhn_u16(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vraddhn_u32
+  return vraddhn_u32(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vraddhn_u64
+  return vraddhn_u64(a, b);
+  // CHECK: raddhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CHECK: test_vraddhn_high_s16
+  return vraddhn_high_s16(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CHECK: test_vraddhn_high_s32
+  return vraddhn_high_s32(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CHECK: test_vraddhn_high_s64
+  return vraddhn_high_s64(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vraddhn_high_u16
+  return vraddhn_high_u16(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vraddhn_high_u32
+  return vraddhn_high_u32(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vraddhn_high_u64
+  return vraddhn_high_u64(r, a, b);
+  // CHECK: raddhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vsubhn_s16
+  return vsubhn_s16(a, b);
+  // CHECK: subhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vsubhn_s32
+  return vsubhn_s32(a, b);
+  // CHECK: subhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
+  // CHECK: test_vsubhn_s64
+  return vsubhn_s64(a, b);
+  // CHECK: subhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vsubhn_u16
+  return vsubhn_u16(a, b);
+  // CHECK: subhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vsubhn_u32
+  return vsubhn_u32(a, b);
+  // CHECK: subhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vsubhn_u64
+  return vsubhn_u64(a, b);
+  // CHECK: subhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CHECK: test_vsubhn_high_s16
+  return vsubhn_high_s16(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CHECK: test_vsubhn_high_s32
+  return vsubhn_high_s32(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CHECK: test_vsubhn_high_s64
+  return vsubhn_high_s64(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vsubhn_high_u16
+  return vsubhn_high_u16(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vsubhn_high_u32
+  return vsubhn_high_u32(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vsubhn_high_u64
+  return vsubhn_high_u64(r, a, b);
+  // CHECK: subhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vrsubhn_s16
+  return vrsubhn_s16(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vrsubhn_s32
+  return vrsubhn_s32(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
+  // CHECK: test_vrsubhn_s64
+  return vrsubhn_s64(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vrsubhn_u16
+  return vrsubhn_u16(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.8b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vrsubhn_u32
+  return vrsubhn_u32(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.4h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vrsubhn_u64
+  return vrsubhn_u64(a, b);
+  // CHECK: rsubhn {{v[0-31]+}}.2s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
+  // CHECK: test_vrsubhn_high_s16
+  return vrsubhn_high_s16(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
+  // CHECK: test_vrsubhn_high_s32
+  return vrsubhn_high_s32(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
+  // CHECK: test_vrsubhn_high_s64
+  return vrsubhn_high_s64(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vrsubhn_high_u16
+  return vrsubhn_high_u16(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.16b, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vrsubhn_high_u32
+  return vrsubhn_high_u32(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.8h, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
+  // CHECK: test_vrsubhn_high_u64
+  return vrsubhn_high_u64(r, a, b);
+  // CHECK: rsubhn2 {{v[0-31]+}}.4s, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+}
+
+int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
+  // CHECK: test_vabdl_s8
+  return vabdl_s8(a, b);
+  // CHECK: sabdl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
+  // CHECK: test_vabdl_s16
+  return vabdl_s16(a, b);
+  // CHECK: sabdl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
+  // CHECK: test_vabdl_s32
+  return vabdl_s32(a, b);
+  // CHECK: sabdl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
+  // CHECK: test_vabdl_u8
+  return vabdl_u8(a, b);
+  // CHECK: uabdl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
+  // CHECK: test_vabdl_u16
+  return vabdl_u16(a, b);
+  // CHECK: uabdl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
+  // CHECK: test_vabdl_u32
+  return vabdl_u32(a, b);
+  // CHECK: uabdl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
+  // CHECK: test_vabal_s8
+  return vabal_s8(a, b, c);
+  // CHECK: sabal {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+  // CHECK: test_vabal_s16
+  return vabal_s16(a, b, c);
+  // CHECK: sabal {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
+  // CHECK: test_vabal_s32
+  return vabal_s32(a, b, c);
+  // CHECK: sabal {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
+  // CHECK: test_vabal_u8
+  return vabal_u8(a, b, c);
+  // CHECK: uabal {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
+  // CHECK: test_vabal_u16
+  return vabal_u16(a, b, c);
+  // CHECK: uabal {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
+  // CHECK: test_vabal_u32
+  return vabal_u32(a, b, c);
+  // CHECK: uabal {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
+  // CHECK: test_vabdl_high_s8
+  return vabdl_high_s8(a, b);
+  // CHECK: sabdl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vabdl_high_s16
+  return vabdl_high_s16(a, b);
+  // CHECK: sabdl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vabdl_high_s32
+  return vabdl_high_s32(a, b);
+  // CHECK: sabdl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
+  // CHECK: test_vabdl_high_u8
+  return vabdl_high_u8(a, b);
+  // CHECK: uabdl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vabdl_high_u16
+  return vabdl_high_u16(a, b);
+  // CHECK: uabdl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vabdl_high_u32
+  return vabdl_high_u32(a, b);
+  // CHECK: uabdl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
+  // CHECK: test_vabal_high_s8
+  return vabal_high_s8(a, b, c);
+  // CHECK: sabal2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
+  // CHECK: test_vabal_high_s16
+  return vabal_high_s16(a, b, c);
+  // CHECK: sabal2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
+  // CHECK: test_vabal_high_s32
+  return vabal_high_s32(a, b, c);
+  // CHECK: sabal2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
+  // CHECK: test_vabal_high_u8
+  return vabal_high_u8(a, b, c);
+  // CHECK: uabal2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
+  // CHECK: test_vabal_high_u16
+  return vabal_high_u16(a, b, c);
+  // CHECK: uabal2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
+  // CHECK: test_vabal_high_u32
+  return vabal_high_u32(a, b, c);
+  // CHECK: uabal2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
+  // CHECK: test_vmull_s8
+  return vmull_s8(a, b);
+  // CHECK: smull {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
+  // CHECK: test_vmull_s16
+  return vmull_s16(a, b);
+  // CHECK: smull {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
+  // CHECK: test_vmull_s32
+  return vmull_s32(a, b);
+  // CHECK: smull {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
+  // CHECK: test_vmull_u8
+  return vmull_u8(a, b);
+  // CHECK: umull {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
+  // CHECK: test_vmull_u16
+  return vmull_u16(a, b);
+  // CHECK: umull {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
+  // CHECK: test_vmull_u32
+  return vmull_u32(a, b);
+  // CHECK: umull {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
+  // CHECK: test_vmull_high_s8
+  return vmull_high_s8(a, b);
+  // CHECK: smull2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vmull_high_s16
+  return vmull_high_s16(a, b);
+  // CHECK: smull2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vmull_high_s32
+  return vmull_high_s32(a, b);
+  // CHECK: smull2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
+  // CHECK: test_vmull_high_u8
+  return vmull_high_u8(a, b);
+  // CHECK: umull2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
+  // CHECK: test_vmull_high_u16
+  return vmull_high_u16(a, b);
+  // CHECK: umull2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
+  // CHECK: test_vmull_high_u32
+  return vmull_high_u32(a, b);
+  // CHECK: umull2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
+  // CHECK: test_vmlal_s8
+  return vmlal_s8(a, b, c);
+  // CHECK: smlal {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+  // CHECK: test_vmlal_s16
+  return vmlal_s16(a, b, c);
+  // CHECK: smlal {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
+  // CHECK: test_vmlal_s32
+  return vmlal_s32(a, b, c);
+  // CHECK: smlal {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
+  // CHECK: test_vmlal_u8
+  return vmlal_u8(a, b, c);
+  // CHECK: umlal {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
+  // CHECK: test_vmlal_u16
+  return vmlal_u16(a, b, c);
+  // CHECK: umlal {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
+  // CHECK: test_vmlal_u32
+  return vmlal_u32(a, b, c);
+  // CHECK: umlal {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
+  // CHECK: test_vmlal_high_s8
+  return vmlal_high_s8(a, b, c);
+  // CHECK: smlal2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
+  // CHECK: test_vmlal_high_s16
+  return vmlal_high_s16(a, b, c);
+  // CHECK: smlal2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
+  // CHECK: test_vmlal_high_s32
+  return vmlal_high_s32(a, b, c);
+  // CHECK: smlal2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
+  // CHECK: test_vmlal_high_u8
+  return vmlal_high_u8(a, b, c);
+  // CHECK: umlal2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
+  // CHECK: test_vmlal_high_u16
+  return vmlal_high_u16(a, b, c);
+  // CHECK: umlal2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
+  // CHECK: test_vmlal_high_u32
+  return vmlal_high_u32(a, b, c);
+  // CHECK: umlal2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
+  // CHECK: test_vmlsl_s8
+  return vmlsl_s8(a, b, c);
+  // CHECK: smlsl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+  // CHECK: test_vmlsl_s16
+  return vmlsl_s16(a, b, c);
+  // CHECK: smlsl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
+  // CHECK: test_vmlsl_s32
+  return vmlsl_s32(a, b, c);
+  // CHECK: smlsl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
+  // CHECK: test_vmlsl_u8
+  return vmlsl_u8(a, b, c);
+  // CHECK: umlsl {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
+  // CHECK: test_vmlsl_u16
+  return vmlsl_u16(a, b, c);
+  // CHECK: umlsl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
+  // CHECK: test_vmlsl_u32
+  return vmlsl_u32(a, b, c);
+  // CHECK: umlsl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
+  // CHECK: test_vmlsl_high_s8
+  return vmlsl_high_s8(a, b, c);
+  // CHECK: smlsl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
+  // CHECK: test_vmlsl_high_s16
+  return vmlsl_high_s16(a, b, c);
+  // CHECK: smlsl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
+  // CHECK: test_vmlsl_high_s32
+  return vmlsl_high_s32(a, b, c);
+  // CHECK: smlsl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
+  // CHECK: test_vmlsl_high_u8
+  return vmlsl_high_u8(a, b, c);
+  // CHECK: umlsl2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
+uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
+  // CHECK: test_vmlsl_high_u16
+  return vmlsl_high_u16(a, b, c);
+  // CHECK: umlsl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
+  // CHECK: test_vmlsl_high_u32
+  return vmlsl_high_u32(a, b, c);
+  // CHECK: umlsl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
+  // CHECK: test_vqdmull_s16
+  return vqdmull_s16(a, b);
+  // CHECK: sqdmull {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
+  // CHECK: test_vqdmull_s32
+  return vqdmull_s32(a, b);
+  // CHECK: sqdmull {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+  // CHECK: test_vqdmlal_s16
+  return vqdmlal_s16(a, b, c);
+  // CHECK: sqdmlal {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
+  // CHECK: test_vqdmlal_s32
+  return vqdmlal_s32(a, b, c);
+  // CHECK: sqdmlal {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
+  // CHECK: test_vqdmlsl_s16
+  return vqdmlsl_s16(a, b, c);
+  // CHECK: sqdmlsl {{v[0-31]+}}.4s, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+}
+
+int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
+  // CHECK: test_vqdmlsl_s32
+  return vqdmlsl_s32(a, b, c);
+  // CHECK: sqdmlsl {{v[0-31]+}}.2d, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+}
+
+int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) {
+  // CHECK: test_vqdmull_high_s16
+  return vqdmull_high_s16(a, b);
+  // CHECK: sqdmull2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) {
+  // CHECK: test_vqdmull_high_s32
+  return vqdmull_high_s32(a, b);
+  // CHECK: sqdmull2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
+  // CHECK: test_vqdmlal_high_s16
+  return vqdmlal_high_s16(a, b, c);
+  // CHECK: sqdmlal2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
+  // CHECK: test_vqdmlal_high_s32
+  return vqdmlal_high_s32(a, b, c);
+  // CHECK: sqdmlal2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
+  // CHECK: test_vqdmlsl_high_s16
+  return vqdmlsl_high_s16(a, b, c);
+  // CHECK: sqdmlsl2 {{v[0-31]+}}.4s, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+}
+
+int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
+  // CHECK: test_vqdmlsl_high_s32
+  return vqdmlsl_high_s32(a, b, c);
+  // CHECK: sqdmlsl2 {{v[0-31]+}}.2d, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+}
+
+poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
+  // CHECK: test_vmull_p8
+  return vmull_p8(a, b);
+  // CHECK: pmull {{v[0-31]+}}.8h, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+}
+
+poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) {
+  // CHECK: test_vmull_high_p8
+  return vmull_high_p8(a, b);
+  // CHECK: pmull2 {{v[0-31]+}}.8h, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+}
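
The *hn_high tests above rely on the narrowing-high expansion added to NeonEmitter below: the caller's existing narrow low half is combined with a freshly narrowed high half, which the backend then matches to the "2" form of the instruction. A minimal sketch of that expansion for the s32 case (hypothetical helper name, standing in for the generated vaddhn_high_s32 body):

  int16x8_t addhn_high_s32_sketch(int16x4_t r, int32x4_t a, int32x4_t b) {
    return vcombine_s16(r, vaddhn_s32(a, b)); /* pattern-matched to addhn2 */
  }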
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index d8f203d3df410ff440c565d1848ab33b98f42f6c..f700c6753a6f0890ba1856d494993294628630fd 100644
@@ -40,15 +40,22 @@ enum OpKind {
   OpUnavailable,
   OpAdd,
   OpAddl,
+  OpAddlHi,
   OpAddw,
+  OpAddwHi,
   OpSub,
   OpSubl,
+  OpSublHi,
   OpSubw,
+  OpSubwHi,
   OpMul,
   OpMla,
   OpMlal,
+  OpMullHi,
+  OpMlalHi,
   OpMls,
   OpMlsl,
+  OpMlslHi,
   OpMulN,
   OpMlaN,
   OpMlsN,
@@ -88,9 +95,18 @@ enum OpKind {
   OpRev32,
   OpRev64,
   OpReinterpret,
+  OpAddhnHi,
+  OpRAddhnHi,
+  OpSubhnHi,
+  OpRSubhnHi,
   OpAbdl,
+  OpAbdlHi,
   OpAba,
   OpAbal,
+  OpAbalHi,
+  OpQDMullHi,
+  OpQDMlalHi,
+  OpQDMlslHi,
   OpDiv,
   OpLongHi,
   OpNarrowHi,
@@ -159,15 +175,22 @@ public:
     OpMap["OP_UNAVAILABLE"] = OpUnavailable;
     OpMap["OP_ADD"]   = OpAdd;
     OpMap["OP_ADDL"]  = OpAddl;
+    OpMap["OP_ADDLHi"] = OpAddlHi;
     OpMap["OP_ADDW"]  = OpAddw;
+    OpMap["OP_ADDWHi"] = OpAddwHi;
     OpMap["OP_SUB"]   = OpSub;
     OpMap["OP_SUBL"]  = OpSubl;
+    OpMap["OP_SUBLHi"] = OpSublHi;
     OpMap["OP_SUBW"]  = OpSubw;
+    OpMap["OP_SUBWHi"] = OpSubwHi;
     OpMap["OP_MUL"]   = OpMul;
     OpMap["OP_MLA"]   = OpMla;
     OpMap["OP_MLAL"]  = OpMlal;
+    OpMap["OP_MULLHi"]  = OpMullHi;
+    OpMap["OP_MLALHi"]  = OpMlalHi;
     OpMap["OP_MLS"]   = OpMls;
     OpMap["OP_MLSL"]  = OpMlsl;
+    OpMap["OP_MLSLHi"] = OpMlslHi;
     OpMap["OP_MUL_N"] = OpMulN;
     OpMap["OP_MLA_N"] = OpMlaN;
     OpMap["OP_MLS_N"] = OpMlsN;
@@ -207,9 +230,18 @@ public:
     OpMap["OP_REV32"] = OpRev32;
     OpMap["OP_REV64"] = OpRev64;
     OpMap["OP_REINT"] = OpReinterpret;
+    OpMap["OP_ADDHNHi"] = OpAddhnHi;
+    OpMap["OP_RADDHNHi"] = OpRAddhnHi;
+    OpMap["OP_SUBHNHi"] = OpSubhnHi;
+    OpMap["OP_RSUBHNHi"] = OpRSubhnHi;
     OpMap["OP_ABDL"]  = OpAbdl;
+    OpMap["OP_ABDLHi"] = OpAbdlHi;
     OpMap["OP_ABA"]   = OpAba;
     OpMap["OP_ABAL"]  = OpAbal;
+    OpMap["OP_ABALHi"] = OpAbalHi;
+    OpMap["OP_QDMULLHi"] = OpQDMullHi;
+    OpMap["OP_QDMLALHi"] = OpQDMlalHi;
+    OpMap["OP_QDMLSLHi"] = OpQDMlslHi;
     OpMap["OP_DIV"] = OpDiv;
     OpMap["OP_LONG_HI"] = OpLongHi;
     OpMap["OP_NARROW_HI"] = OpNarrowHi;
@@ -326,6 +358,29 @@ static char Narrow(const char t) {
   }
 }
 
+static std::string GetNarrowTypestr(StringRef ty)
+{
+  std::string s;
+  for (size_t i = 0, end = ty.size(); i < end; i++) {
+    switch (ty[i]) {
+      case 's':
+        s += 'c';
+        break;
+      case 'i':
+        s += 's';
+        break;
+      case 'l':
+        s += 'i';
+        break;
+      default:
+        s += ty[i];
+        break;
+    }
+  }
+
+  return s;
+}
+
 /// For a particular StringRef, return the base type code, and whether it has
 /// the quad-vector, polynomial, or unsigned modifiers set.
 static char ClassifyType(StringRef ty, bool &quad, bool &poly, bool &usgn) {
@@ -426,6 +481,10 @@ static char ModType(const char mod, char type, bool &quad, bool &poly,
       if (type == 'h')
         quad = false;
       break;
+    case 'q':
+      type = Narrow(type);
+      quad = true;
+      break;
     case 'e':
       type = Narrow(type);
       usgn = true;
@@ -1286,13 +1345,60 @@ static std::string GenMacroLocals(const std::string &proto, StringRef typestr) {
 }
 
 // Use the vmovl builtin to sign-extend or zero-extend a vector.
-static std::string Extend(StringRef typestr, const std::string &a) {
+static std::string Extend(StringRef typestr, const std::string &a, bool h = false) {
+  std::string s, high;
+  high = h ? "_high" : "";
+  s = MangleName("vmovl" + high, typestr, ClassS);
+  s += "(" + a + ")";
+  return s;
+}
+
+// Get the high 64-bit half of a vector.
+static std::string GetHigh(const std::string &a, StringRef typestr) {
   std::string s;
-  s = MangleName("vmovl", typestr, ClassS);
+  s = MangleName("vget_high", typestr, ClassS);
   s += "(" + a + ")";
   return s;
 }
 
+// Generate an operation on two operands, taking the high 64 bits of each operand.
+static std::string Gen2OpWith2High(StringRef typestr,
+                                   const std::string &op,
+                                   const std::string &a,
+                                   const std::string &b) {
+  std::string s;
+  std::string Op1 = GetHigh(a, typestr);
+  std::string Op2 = GetHigh(b, typestr);
+  s = MangleName(op, typestr, ClassS);
+  s += "(" + Op1 + ", " + Op2 + ");";
+  return s;
+}
+
+// Generate an operation on three operands, taking the high 64 bits of the
+// latter two operands.
+static std::string Gen3OpWith2High(StringRef typestr,
+                                   const std::string &op,
+                                   const std::string &a,
+                                   const std::string &b,
+                                   const std::string &c) {
+  std::string s;
+  std::string Op1 = GetHigh(b, typestr);
+  std::string Op2 = GetHigh(c, typestr);
+  s = MangleName(op, typestr, ClassS);
+  s += "(" + a + ", " + Op1 + ", " + Op2 + ");";
+  return s;
+}
+
+// Generate a vcombine operation, placing a in the low 64 bits and b in the high 64 bits.
+static std::string GenCombine(std::string typestr,
+                              const std::string &a,
+                              const std::string &b) {
+  std::string s;
+  s = MangleName("vcombine", typestr, ClassS);
+  s += "(" + a + ", " + b + ")";
+  return s;
+}
+
 static std::string Duplicate(unsigned nElts, StringRef typestr,
                              const std::string &a) {
   std::string s;
@@ -1368,18 +1474,30 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpAddl:
     s += Extend(typestr, "__a") + " + " + Extend(typestr, "__b") + ";";
     break;
+  case OpAddlHi:
+    s += Extend(typestr, "__a", 1) + " + " + Extend(typestr, "__b", 1) + ";";
+    break;
   case OpAddw:
     s += "__a + " + Extend(typestr, "__b") + ";";
     break;
+  case OpAddwHi:
+    s += "__a + " + Extend(typestr, "__b", 1) + ";";
+    break;
   case OpSub:
     s += "__a - __b;";
     break;
   case OpSubl:
     s += Extend(typestr, "__a") + " - " + Extend(typestr, "__b") + ";";
     break;
+  case OpSublHi:
+    s += Extend(typestr, "__a", 1) + " - " + Extend(typestr, "__b", 1) + ";";
+    break;
   case OpSubw:
     s += "__a - " + Extend(typestr, "__b") + ";";
     break;
+  case OpSubwHi:
+    s += "__a - " + Extend(typestr, "__b", 1) + ";";
+    break;
   case OpMulN:
     s += "__a * " + Duplicate(nElts, typestr, "__b") + ";";
     break;
@@ -1413,6 +1531,12 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpMlal:
     s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
     break;
+  case OpMullHi:
+    s += Gen2OpWith2High(typestr, "vmull", "__a", "__b");
+    break;
+  case OpMlalHi:
+    s += Gen3OpWith2High(typestr, "vmlal", "__a", "__b", "__c");
+    break;
   case OpMlsN:
     s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");";
     break;
@@ -1433,6 +1557,9 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpMlsl:
     s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
     break;
+  case OpMlslHi:
+    s += Gen3OpWith2High(typestr, "vmlsl", "__a", "__b", "__c");
+    break;
   case OpQDMullLane:
     s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
       SplatLane(nElts, "__b", "__c") + ");";
@@ -1560,23 +1687,51 @@ static std::string GenOpString(const std::string &name, OpKind op,
     }
     break;
   }
+  case OpAbdlHi:
+    s += Gen2OpWith2High(typestr, "vabdl", "__a", "__b");
+    break;
+  case OpAddhnHi: {
+    std::string addhn = MangleName("vaddhn", typestr, ClassS) + "(__b, __c)";
+    s += GenCombine(GetNarrowTypestr(typestr), "__a", addhn);
+    s += ";";
+    break;
+  }
+  case OpRAddhnHi: {
+    std::string raddhn = MangleName("vraddhn", typestr, ClassS) + "(__b, __c)";
+    s += GenCombine(GetNarrowTypestr(typestr), "__a", raddhn);
+    s += ";";
+    break;
+  }
+  case OpSubhnHi: {
+    std::string subhn = MangleName("vsubhn", typestr, ClassS) + "(__b, __c)";
+    s += GenCombine(GetNarrowTypestr(typestr), "__a", subhn);
+    s += ";";
+    break;
+  }
+  case OpRSubhnHi: {
+    std::string rsubhn = MangleName("vrsubhn", typestr, ClassS) + "(__b, __c)";
+    s += GenCombine(GetNarrowTypestr(typestr), "__a", rsubhn);
+    s += ";";
+    break;
+  }
   case OpAba:
     s += "__a + " + MangleName("vabd", typestr, ClassS) + "(__b, __c);";
     break;
-  case OpAbal: {
-    s += "__a + ";
-    std::string abd = MangleName("vabd", typestr, ClassS) + "(__b, __c)";
-    if (typestr[0] != 'U') {
-      // vabd results are always unsigned and must be zero-extended.
-      std::string utype = "U" + typestr.str();
-      s += "(" + TypeString(proto[0], typestr) + ")";
-      abd = "(" + TypeString('d', utype) + ")" + abd;
-      s += Extend(utype, abd) + ";";
-    } else {
-      s += Extend(typestr, abd) + ";";
-    }
+  case OpAbal:
+    s += "__a + " + MangleName("vabdl", typestr, ClassS) + "(__b, __c);";
+    break;
+  case OpAbalHi:
+    s += Gen3OpWith2High(typestr, "vabal", "__a", "__b", "__c");
+    break;
+  case OpQDMullHi:
+    s += Gen2OpWith2High(typestr, "vqdmull", "__a", "__b");
+    break;
+  case OpQDMlalHi:
+    s += Gen3OpWith2High(typestr, "vqdmlal", "__a", "__b", "__c");
+    break;
+  case OpQDMlslHi:
+    s += Gen3OpWith2High(typestr, "vqdmlsl", "__a", "__b", "__c");
     break;
-  }
   case OpDiv:
     s += "__a / __b;";
     break;
@@ -1993,6 +2148,7 @@ void NeonEmitter::run(raw_ostream &OS) {
   emitIntrinsic(OS, Records.getDef("VMOVL"), EmittedMap);
   emitIntrinsic(OS, Records.getDef("VMULL"), EmittedMap);
   emitIntrinsic(OS, Records.getDef("VABD"), EmittedMap);
+  emitIntrinsic(OS, Records.getDef("VABDL"), EmittedMap);
 
   // ARM intrinsics must be emitted before AArch64 intrinsics to ensure
   // common intrinsics appear only once in the output stream.
@@ -2014,6 +2170,10 @@ void NeonEmitter::run(raw_ostream &OS) {
   // Emit AArch64-specific intrinsics.
   OS << "#ifdef __aarch64__\n";
 
+  emitIntrinsic(OS, Records.getDef("VMOVL_HIGH"), EmittedMap);
+  emitIntrinsic(OS, Records.getDef("VMULL_HIGH"), EmittedMap);
+  emitIntrinsic(OS, Records.getDef("VABDL_HIGH"), EmittedMap);
+
   for (unsigned i = 0, e = RV.size(); i != e; ++i) {
     Record *R = RV[i];
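
For the three-operand accumulating forms (vabal_high, vmlal_high, vmlsl_high, vqdmlal_high, vqdmlsl_high), Gen3OpWith2High simply reuses the existing non-high intrinsic on the high halves of the two multiplicand operands. A minimal sketch of the expansion it emits for vqdmlal_high_s16 (hypothetical helper name, standing in for the generated body):

  int32x4_t qdmlal_high_s16_sketch(int32x4_t a, int16x8_t b, int16x8_t c) {
    return vqdmlal_s16(a, vget_high_s16(b), vget_high_s16(c)); /* selects sqdmlal2 */
  }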