From: Jiangning Liu Date: Fri, 4 Oct 2013 09:21:17 +0000 (+0000) Subject: Implement aarch64 neon instruction set AdvSIMD (3V elem). X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0aa1a88e19235574481e46e9e6e9ce66a9e6624f;p=clang Implement aarch64 neon instruction set AdvSIMD (3V elem). git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@191945 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td index b59843a6ea..0b5508ac65 100644 --- a/include/clang/Basic/arm_neon.td +++ b/include/clang/Basic/arm_neon.td @@ -40,16 +40,25 @@ def OP_MLS_N : Op; def OP_MLAL_N : Op; def OP_MLSL_N : Op; def OP_MUL_LN: Op; +def OP_MULX_LN: Op; def OP_MULL_LN : Op; +def OP_MULLHi_LN : Op; def OP_MLA_LN: Op; def OP_MLS_LN: Op; def OP_MLAL_LN : Op; +def OP_MLALHi_LN : Op; def OP_MLSL_LN : Op; +def OP_MLSLHi_LN : Op; def OP_QDMULL_LN : Op; +def OP_QDMULLHi_LN : Op; def OP_QDMLAL_LN : Op; +def OP_QDMLALHi_LN : Op; def OP_QDMLSL_LN : Op; +def OP_QDMLSLHi_LN : Op; def OP_QDMULH_LN : Op; def OP_QRDMULH_LN : Op; +def OP_FMS_LN : Op; +def OP_FMS_LNQ : Op; def OP_EQ : Op; def OP_GE : Op; def OP_LE : Op; @@ -146,6 +155,7 @@ class NoTestOpInst : Inst {} // f: float (int args) // d: default // g: default, ignore 'Q' size modifier. +// j: default, force 'Q' size modifier. // w: double width elements, same num elts // n: double width elements, half num elts // h: half width elements, double num elts @@ -503,7 +513,7 @@ def MLS : IOpInst<"vmls", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUiQd", OP_MLS>; //////////////////////////////////////////////////////////////////////////////// // Multiplication Extended -def MULX : SInst<"vmulx", "ddd", "fQfQd">; +def MULX : SInst<"vmulx", "ddd", "fdQfQd">; //////////////////////////////////////////////////////////////////////////////// // Division @@ -629,6 +639,63 @@ def VQDMULL_HIGH : SOpInst<"vqdmull_high", "wkk", "si", OP_QDMULLHi>; def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>; def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>; +//////////////////////////////////////////////////////////////////////////////// + +def VMLA_LANEQ : IOpInst<"vmla_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; +def VMLS_LANEQ : IOpInst<"vmls_laneq", "dddji", + "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; + +def VFMA_LANE : IInst<"vfma_lane", "dddgi", "fdQfQd">; +def VFMA_LANEQ : IInst<"vfma_laneq", "dddji", "fdQfQd">; +def VFMS_LANE : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LNQ>; + +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "wwdki", "siUsUi", OP_MLAL_LN>; +def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi", + OP_MLALHi_LN>; +def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi", + OP_MLALHi_LN>; +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LN>; +def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi", + OP_MLSLHi_LN>; +def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi", + OP_MLSLHi_LN>; + +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LN>; +def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "wwkdi", "si", + OP_QDMLALHi_LN>; +def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "wwkki", "si", + OP_QDMLALHi_LN>; +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LN>; +def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si", + OP_QDMLSLHi_LN>; +def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si", + OP_QDMLSLHi_LN>; + +// Newly add double parameter for vmul_lane in aarch64 +def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "dQd", OP_MUL_LN>; + +def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji", + "sifdUsUiQsQiQfQUsQUiQfQd", OP_MUL_LN>; +def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>; +def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi", + OP_MULLHi_LN>; +def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "wkki", "siUsUi", + OP_MULLHi_LN>; + +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LN>; +def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "wkdi", "si", + OP_QDMULLHi_LN>; +def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si", + OP_QDMULLHi_LN>; + +def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>; +def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>; + +def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fdQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fdQfQd", OP_MULX_LN>; + //////////////////////////////////////////////////////////////////////////////// // Scalar Arithmetic diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index a6911bab34..b4caab2297 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -2222,6 +2222,46 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } // AArch64-only builtins + case AArch64::BI__builtin_neon_vfma_lane_v: + case AArch64::BI__builtin_neon_vfmaq_laneq_v: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + + Ops[2] = Builder.CreateBitCast(Ops[2], Ty); + Ops[2] = EmitNeonSplat(Ops[2], cast(Ops[3])); + return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + } + case AArch64::BI__builtin_neon_vfmaq_lane_v: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + + llvm::VectorType *VTy = cast(Ty); + llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(), + VTy->getNumElements() / 2); + Ops[2] = Builder.CreateBitCast(Ops[2], STy); + Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), + cast(Ops[3])); + Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane"); + + return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + } + case AArch64::BI__builtin_neon_vfma_laneq_v: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); + Ops[0] = Builder.CreateBitCast(Ops[0], Ty); + Ops[1] = Builder.CreateBitCast(Ops[1], Ty); + + llvm::VectorType *VTy = cast(Ty); + llvm::Type *STy = llvm::VectorType::get(VTy->getElementType(), + VTy->getNumElements() * 2); + Ops[2] = Builder.CreateBitCast(Ops[2], STy); + Value* SV = llvm::ConstantVector::getSplat(VTy->getNumElements(), + cast(Ops[3])); + Ops[2] = Builder.CreateShuffleVector(Ops[2], Ops[2], SV, "lane"); + + return Builder.CreateCall3(F, Ops[2], Ops[1], Ops[0]); + } case AArch64::BI__builtin_neon_vfms_v: case AArch64::BI__builtin_neon_vfmsq_v: { Value *F = CGM.getIntrinsic(Intrinsic::fma, Ty); diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c new file mode 100644 index 0000000000..f34e11a3ce --- /dev/null +++ b/test/CodeGen/aarch64-neon-2velem.c @@ -0,0 +1,802 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -S -O3 -o - %s | FileCheck %s + +// Test new aarch64 intrinsics and types + +#include + +int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmla_lane_s16 + return vmla_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlaq_lane_s16 + return vmlaq_lane_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmla_lane_s32 + return vmla_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlaq_lane_s32 + return vmlaq_lane_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmla_laneq_s16 + return vmla_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlaq_laneq_s16 + return vmlaq_laneq_s16(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmla_laneq_s32 + return vmla_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlaq_laneq_s32 + return vmlaq_laneq_s32(a, b, v, 1); + // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmls_lane_s16 + return vmls_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsq_lane_s16 + return vmlsq_lane_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmls_lane_s32 + return vmls_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsq_lane_s32 + return vmlsq_lane_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmls_laneq_s16 + return vmls_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsq_laneq_s16 + return vmlsq_laneq_s16(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmls_laneq_s32 + return vmls_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsq_laneq_s32 + return vmlsq_laneq_s32(a, b, v, 1); + // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vmul_lane_s16 + return vmul_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmulq_lane_s16 + return vmulq_lane_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmul_lane_s32 + return vmul_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmulq_lane_s32 + return vmulq_lane_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { + // CHECK: test_vmul_lane_u16 + return vmul_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmulq_lane_u16 + return vmulq_lane_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { + // CHECK: test_vmul_lane_u32 + return vmul_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmulq_lane_u32 + return vmulq_lane_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmul_laneq_s16 + return vmul_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmulq_laneq_s16 + return vmulq_laneq_s16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmul_laneq_s32 + return vmul_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmulq_laneq_s32 + return vmulq_laneq_s32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmul_laneq_u16 + return vmul_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmulq_laneq_u16 + return vmulq_laneq_u16(a, v, 1); + // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmul_laneq_u32 + return vmul_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmulq_laneq_u32 + return vmulq_laneq_u32(a, v, 1); + // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfma_lane_f32 + return vfma_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmaq_lane_f32 + return vfmaq_lane_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfma_laneq_f32 + return vfma_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmaq_laneq_f32 + return vfmaq_laneq_f32(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { + // CHECK: test_vfms_lane_f32 + return vfms_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { + // CHECK: test_vfmsq_lane_f32 + return vfmsq_lane_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { + // CHECK: test_vfms_laneq_f32 + return vfms_laneq_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { + // CHECK: test_vfmsq_laneq_f32 + return vfmsq_laneq_f32(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmaq_lane_f64 + return vfmaq_lane_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 0); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmaq_laneq_f64 + return vfmaq_laneq_f64(a, b, v, 1); + // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) { + // CHECK: test_vfmsq_lane_f64 + return vfmsq_lane_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 0); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) { + // CHECK: test_vfmsq_laneq_f64 + return vfmsq_laneq_f64(a, b, v, 1); + // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_s16 + return vmlal_lane_s16(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_s32 + return vmlal_lane_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_s16 + return vmlal_laneq_s16(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_s32 + return vmlal_laneq_s32(a, b, v, 1); + // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_s16 + return vmlal_high_lane_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_s32 + return vmlal_high_lane_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_s16 + return vmlal_high_laneq_s16(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_s32 + return vmlal_high_laneq_s32(a, b, v, 1); + // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_s16 + return vmlsl_lane_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_s32 + return vmlsl_lane_s32(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_s16 + return vmlsl_laneq_s16(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_s32 + return vmlsl_laneq_s32(a, b, v, 1); + // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_s16 + return vmlsl_high_lane_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_s32 + return vmlsl_high_lane_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_s16 + return vmlsl_high_laneq_s16(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_s32 + return vmlsl_high_laneq_s32(a, b, v, 1); + // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlal_lane_u16 + return vmlal_lane_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlal_lane_u32 + return vmlal_lane_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlal_laneq_u16 + return vmlal_laneq_u16(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlal_laneq_u32 + return vmlal_laneq_u32(a, b, v, 1); + // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlal_high_lane_u16 + return vmlal_high_lane_u16(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlal_high_lane_u32 + return vmlal_high_lane_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlal_high_laneq_u16 + return vmlal_high_laneq_u16(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlal_high_laneq_u32 + return vmlal_high_laneq_u32(a, b, v, 1); + // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vmlsl_lane_u16 + return vmlsl_lane_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vmlsl_lane_u32 + return vmlsl_lane_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { + // CHECK: test_vmlsl_laneq_u16 + return vmlsl_laneq_u16(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { + // CHECK: test_vmlsl_laneq_u32 + return vmlsl_laneq_u32(a, b, v, 1); + // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vmlsl_high_lane_u16 + return vmlsl_high_lane_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vmlsl_high_lane_u32 + return vmlsl_high_lane_u32(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { + // CHECK: test_vmlsl_high_laneq_u16 + return vmlsl_high_laneq_u16(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { + // CHECK: test_vmlsl_high_laneq_u32 + return vmlsl_high_laneq_u32(a, b, v, 1); + // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vmull_lane_s16 + return vmull_lane_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vmull_lane_s32 + return vmull_lane_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { + // CHECK: test_vmull_lane_u16 + return vmull_lane_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { + // CHECK: test_vmull_lane_u32 + return vmull_lane_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vmull_high_lane_s16 + return vmull_high_lane_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vmull_high_lane_s32 + return vmull_high_lane_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { + // CHECK: test_vmull_high_lane_u16 + return vmull_high_lane_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { + // CHECK: test_vmull_high_lane_u32 + return vmull_high_lane_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vmull_laneq_s16 + return vmull_laneq_s16(a, v, 1); + // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vmull_laneq_s32 + return vmull_laneq_s32(a, v, 1); + // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { + // CHECK: test_vmull_laneq_u16 + return vmull_laneq_u16(a, v, 1); + // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { + // CHECK: test_vmull_laneq_u32 + return vmull_laneq_u32(a, v, 1); + // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vmull_high_laneq_s16 + return vmull_high_laneq_s16(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vmull_high_laneq_s32 + return vmull_high_laneq_s32(a, v, 1); + // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { + // CHECK: test_vmull_high_laneq_u16 + return vmull_high_laneq_u16(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { + // CHECK: test_vmull_high_laneq_u32 + return vmull_high_laneq_u32(a, v, 1); + // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlal_lane_s16 + return vqdmlal_lane_s16(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlal_lane_s32 + return vqdmlal_lane_s32(a, b, v, 1); + // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlal_high_lane_s16 + return vqdmlal_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlal_high_lane_s32 + return vqdmlal_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_lane_s16 + return vqdmlsl_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_lane_s32 + return vqdmlsl_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { + // CHECK: test_vqdmlsl_high_lane_s16 + return vqdmlsl_high_lane_s16(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { + // CHECK: test_vqdmlsl_high_lane_s32 + return vqdmlsl_high_lane_s32(a, b, v, 1); + // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmull_lane_s16 + return vqdmull_lane_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqdmull_lane_s32 + return vqdmull_lane_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { + // CHECK: test_vqdmull_laneq_s16 + return vqdmull_laneq_s16(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { + // CHECK: test_vqdmull_laneq_s32 + return vqdmull_laneq_s32(a, v, 1); + // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmull_high_lane_s16 + return vqdmull_high_lane_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmull_high_lane_s32 + return vqdmull_high_lane_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { + // CHECK: test_vqdmull_high_laneq_s16 + return vqdmull_high_laneq_s16(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { + // CHECK: test_vqdmull_high_laneq_s32 + return vqdmull_high_laneq_s32(a, v, 1); + // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqdmulh_lane_s16 + return vqdmulh_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqdmulhq_lane_s16 + return vqdmulhq_lane_s16(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqdmulh_lane_s32 + return vqdmulh_lane_s32(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqdmulhq_lane_s32 + return vqdmulhq_lane_s32(a, v, 1); + // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { + // CHECK: test_vqrdmulh_lane_s16 + return vqrdmulh_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[1] +} + +int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { + // CHECK: test_vqrdmulhq_lane_s16 + return vqrdmulhq_lane_s16(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[1] +} + +int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { + // CHECK: test_vqrdmulh_lane_s32 + return vqrdmulh_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { + // CHECK: test_vqrdmulhq_lane_s32 + return vqrdmulhq_lane_s32(a, v, 1); + // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: test_vmul_lane_f32 + return vmul_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulq_lane_f32 + return vmulq_lane_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulq_lane_f64 + return vmulq_lane_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmul_laneq_f32 + return vmul_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: test_vmulq_laneq_f32 + return vmulq_laneq_f32(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 0); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulq_laneq_f64 + return vmulq_laneq_f64(a, v, 1); + // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + +float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { + // CHECK: test_vmulx_lane_f32 + return vmulx_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { + // CHECK: test_vmulxq_lane_f32 + return vmulxq_lane_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { + // CHECK: test_vmulxq_lane_f64 + return vmulxq_lane_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { + // CHECK: test_vmulx_laneq_f32 + return vmulx_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] +} + +float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { + // CHECK: test_vmulxq_laneq_f32 + return vmulxq_laneq_f32(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +} + +float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 0); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +} + +float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { + // CHECK: test_vmulxq_laneq_f64 + return vmulxq_laneq_f64(a, v, 1); + // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1] +} + diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp index f700c6753a..9dc2d56f6c 100644 --- a/utils/TableGen/NeonEmitter.cpp +++ b/utils/TableGen/NeonEmitter.cpp @@ -62,16 +62,25 @@ enum OpKind { OpMlalN, OpMlslN, OpMulLane, + OpMulXLane, OpMullLane, + OpMullHiLane, OpMlaLane, OpMlsLane, OpMlalLane, + OpMlalHiLane, OpMlslLane, + OpMlslHiLane, OpQDMullLane, + OpQDMullHiLane, OpQDMlalLane, + OpQDMlalHiLane, OpQDMlslLane, + OpQDMlslHiLane, OpQDMulhLane, OpQRDMulhLane, + OpFMSLane, + OpFMSLaneQ, OpEq, OpGe, OpLe, @@ -197,16 +206,25 @@ public: OpMap["OP_MLAL_N"] = OpMlalN; OpMap["OP_MLSL_N"] = OpMlslN; OpMap["OP_MUL_LN"]= OpMulLane; + OpMap["OP_MULX_LN"]= OpMulXLane; OpMap["OP_MULL_LN"] = OpMullLane; + OpMap["OP_MULLHi_LN"] = OpMullHiLane; OpMap["OP_MLA_LN"]= OpMlaLane; OpMap["OP_MLS_LN"]= OpMlsLane; OpMap["OP_MLAL_LN"] = OpMlalLane; + OpMap["OP_MLALHi_LN"] = OpMlalHiLane; OpMap["OP_MLSL_LN"] = OpMlslLane; + OpMap["OP_MLSLHi_LN"] = OpMlslHiLane; OpMap["OP_QDMULL_LN"] = OpQDMullLane; + OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane; OpMap["OP_QDMLAL_LN"] = OpQDMlalLane; + OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane; OpMap["OP_QDMLSL_LN"] = OpQDMlslLane; + OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane; OpMap["OP_QDMULH_LN"] = OpQDMulhLane; OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane; + OpMap["OP_FMS_LN"] = OpFMSLane; + OpMap["OP_FMS_LNQ"] = OpFMSLaneQ; OpMap["OP_EQ"] = OpEq; OpMap["OP_GE"] = OpGe; OpMap["OP_LE"] = OpLe; @@ -447,6 +465,9 @@ static char ModType(const char mod, char type, bool &quad, bool &poly, case 'g': quad = false; break; + case 'j': + quad = true; + break; case 'w': type = Widen(type); quad = true; @@ -626,7 +647,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, type = 's'; usgn = true; } - usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && scal && type != 'f'); + usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && + scal && type != 'f' && type != 'd'); if (scal) { SmallString<128> s; @@ -657,6 +679,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, return "vv*"; // void result with void* first argument if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -677,6 +701,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, if (mod == 'f' || (ck != ClassB && type == 'f')) return quad ? "V4f" : "V2f"; + if (ck != ClassB && type == 'd') + return quad ? "V2d" : "V1d"; if (ck != ClassB && type == 's') return quad ? "V8s" : "V4s"; if (ck != ClassB && type == 'i') @@ -974,6 +1000,7 @@ static void NormalizeProtoForRegisterPatternCreation(const std::string &Name, NormedProto += 'q'; break; case 'g': + case 'j': case 'h': case 'e': NormedProto += 'd'; @@ -1504,6 +1531,10 @@ static std::string GenOpString(const std::string &name, OpKind op, case OpMulLane: s += "__a * " + SplatLane(nElts, "__b", "__c") + ";"; break; + case OpMulXLane: + s += MangleName("vmulx", typestr, ClassS) + "(__a, " + + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMul: s += "__a * __b;"; break; @@ -1511,6 +1542,10 @@ static std::string GenOpString(const std::string &name, OpKind op, s += MangleName("vmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpMullHiLane: + s += MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpMlaN: s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");"; break; @@ -1528,6 +1563,10 @@ static std::string GenOpString(const std::string &name, OpKind op, s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlalHiLane: + s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlal: s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1543,6 +1582,18 @@ static std::string GenOpString(const std::string &name, OpKind op, case OpMlsLane: s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpFMSLane: + s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n "; + s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n "; + s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n "; + s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);"; + break; + case OpFMSLaneQ: + s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n "; + s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n "; + s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n "; + s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);"; + break; case OpMls: s += "__a - (__b * __c);"; break; @@ -1554,6 +1605,10 @@ static std::string GenOpString(const std::string &name, OpKind op, s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpMlslHiLane: + s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpMlsl: s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);"; break; @@ -1564,14 +1619,26 @@ static std::string GenOpString(const std::string &name, OpKind op, s += MangleName("vqdmull", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; break; + case OpQDMullHiLane: + s += MangleName("vqdmull", typestr, ClassS) + "(" + + GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");"; + break; case OpQDMlalLane: s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlalHiLane: + s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMlslLane: s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " + SplatLane(nElts, "__c", "__d") + ");"; break; + case OpQDMlslHiLane: + s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " + + GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");"; + break; case OpQDMulhLane: s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " + SplatLane(nElts, "__b", "__c") + ");"; @@ -2072,20 +2139,28 @@ void NeonEmitter::run(raw_ostream &OS) { // Emit Neon vector typedefs. std::string TypedefTypes( - "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfQdPcQPcPsQPs"); + "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPs"); SmallVector TDTypeVec; ParseTypes(0, TypedefTypes, TDTypeVec); // Emit vector typedefs. + bool isA64 = false; for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = isA64? false: true; isA64 = true; - - if (isA64) + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; if (poly) @@ -2101,22 +2176,28 @@ void NeonEmitter::run(raw_ostream &OS) { OS << TypeString('s', TDTypeVec[i]); OS << " " << TypeString('d', TDTypeVec[i]) << ";\n"; - if (isA64) - OS << "#endif\n"; } OS << "\n"; // Emit struct typedefs. + isA64 = false; for (unsigned vi = 2; vi != 5; ++vi) { for (unsigned i = 0, e = TDTypeVec.size(); i != e; ++i) { bool dummy, quad = false, poly = false; char type = ClassifyType(TDTypeVec[i], quad, poly, dummy); - bool isA64 = false; + bool preinsert = false; + bool postinsert = false; - if (type == 'd' && quad) + if (type == 'd') { + preinsert = isA64? false: true; isA64 = true; - - if (isA64) + } else { + postinsert = isA64? true: false; + isA64 = false; + } + if (postinsert) + OS << "#endif\n"; + if (preinsert) OS << "#ifdef __aarch64__\n"; std::string ts = TypeString('d', TDTypeVec[i]); @@ -2126,10 +2207,6 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "[" << utostr(vi) << "]"; OS << ";\n} "; OS << vs << ";\n"; - - if (isA64) - OS << "#endif\n"; - OS << "\n"; } } @@ -2255,6 +2332,7 @@ static unsigned RangeFromType(const char mod, StringRef typestr) { case 'f': case 'i': return (2 << (int)quad) - 1; + case 'd': case 'l': return (1 << (int)quad) - 1; default: