From: Amaury Sechet Date: Thu, 22 Aug 2019 18:53:41 +0000 (+0000) Subject: [AArch64] autogenerate some tests. NFC X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c89a2e38905edc7ebc51e1dcf83436147ef70d12;p=llvm [AArch64] autogenerate some tests. NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369685 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll index 6d795dbcff8..bfb4183f4da 100644 --- a/test/CodeGen/AArch64/arm64-vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -1,9 +1,13 @@ -; RUN: llc < %s -asm-verbose=false -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s - +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: smull8h: -;CHECK: smull.8h +; CHECK-LABEL: smull8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: smull.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -11,8 +15,12 @@ define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: smull4s: -;CHECK: smull.4s +; CHECK-LABEL: smull4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: smull.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -20,8 +28,12 @@ define <4 x i32> @smull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @smull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: smull2d: -;CHECK: smull.2d +; CHECK-LABEL: 
smull2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: smull.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -33,8 +45,12 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: umull8h: -;CHECK: umull.8h +; CHECK-LABEL: umull8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: umull.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -42,8 +58,12 @@ define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { } define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: umull4s: -;CHECK: umull.4s +; CHECK-LABEL: umull4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: umull.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -51,8 +71,12 @@ define <4 x i32> @umull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @umull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: umull2d: -;CHECK: umull.2d +; CHECK-LABEL: umull2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: umull.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -64,8 +88,12 @@ declare <4 x i32> 
@llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmull4s: -;CHECK: sqdmull.4s +; CHECK-LABEL: sqdmull4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -73,8 +101,12 @@ define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmull2d: -;CHECK: sqdmull.2d +; CHECK-LABEL: sqdmull2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -82,8 +114,12 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmull2_4s: -;CHECK: sqdmull.4s +; CHECK-LABEL: sqdmull2_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, #8] +; CHECK-NEXT: ldr d1, [x1, #8] +; CHECK-NEXT: sqdmull.4s v0, v0, v1 +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> @@ -93,8 +129,12 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmull2_2d: -;CHECK: sqdmull.2d +; CHECK-LABEL: sqdmull2_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, #8] +; CHECK-NEXT: ldr d1, [x1, #8] +; CHECK-NEXT: sqdmull.2d 
v0, v0, v1 +; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3> @@ -108,8 +148,12 @@ declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwin declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK-LABEL: pmull8h: -;CHECK: pmull.8h +; CHECK-LABEL: pmull8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -119,8 +163,12 @@ define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind { declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmulh_4h: -;CHECK: sqdmulh.4h +; CHECK-LABEL: sqdmulh_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmulh.4h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -128,8 +176,12 @@ define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <8 x i16> @sqdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmulh_8h: -;CHECK: sqdmulh.8h +; CHECK-LABEL: sqdmulh_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqdmulh.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -137,8 +189,12 @@ define <8 x i16> @sqdmulh_8h(<8 x 
i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmulh_2s: -;CHECK: sqdmulh.2s +; CHECK-LABEL: sqdmulh_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmulh.2s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -146,8 +202,12 @@ define <2 x i32> @sqdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmulh_4s: -;CHECK: sqdmulh.4s +; CHECK-LABEL: sqdmulh_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqdmulh.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -155,8 +215,15 @@ define <4 x i32> @sqdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define i32 @sqdmulh_1s(i32* %A, i32* %B) nounwind { -;CHECK-LABEL: sqdmulh_1s: -;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-LABEL: sqdmulh_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: sqdmulh s0, s0, s1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %tmp1 = load i32, i32* %A %tmp2 = load i32, i32* %B %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2) @@ -170,8 +237,12 @@ declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwin declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_4h: -;CHECK: sqrdmulh.4h +; CHECK-LABEL: sqrdmulh_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; 
CHECK-NEXT: sqrdmulh.4h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -179,8 +250,12 @@ define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_8h: -;CHECK: sqrdmulh.8h +; CHECK-LABEL: sqrdmulh_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqrdmulh.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -188,8 +263,12 @@ define <8 x i16> @sqrdmulh_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_2s: -;CHECK: sqrdmulh.2s +; CHECK-LABEL: sqrdmulh_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqrdmulh.2s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -197,8 +276,12 @@ define <2 x i32> @sqrdmulh_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_4s: -;CHECK: sqrdmulh.4s +; CHECK-LABEL: sqrdmulh_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqrdmulh.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -206,8 +289,15 @@ define <4 x i32> @sqrdmulh_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define i32 @sqrdmulh_1s(i32* %A, i32* %B) nounwind { 
-;CHECK-LABEL: sqrdmulh_1s: -;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-LABEL: sqrdmulh_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ldr w9, [x1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: sqrdmulh s0, s0, s1 +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %tmp1 = load i32, i32* %A %tmp2 = load i32, i32* %B %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2) @@ -221,8 +311,12 @@ declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwi declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK-LABEL: fmulx_2s: -;CHECK: fmulx.2s +; CHECK-LABEL: fmulx_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: fmulx.2s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) @@ -230,8 +324,12 @@ define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind { } define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: fmulx_4s: -;CHECK: fmulx.4s +; CHECK-LABEL: fmulx_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmulx.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) @@ -239,8 +337,12 @@ define <4 x float> @fmulx_4s(<4 x float>* %A, <4 x float>* %B) nounwind { } define <2 x double> @fmulx_2d(<2 x double>* %A, <2 x double>* %B) nounwind { -;CHECK-LABEL: fmulx_2d: -;CHECK: fmulx.2d +; CHECK-LABEL: fmulx_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmulx.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp1 = load <2 
x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) @@ -252,8 +354,13 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nou declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: smlal4s: -;CHECK: smlal.4s +; CHECK-LABEL: smlal4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlal.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -263,8 +370,13 @@ define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind } define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: smlal2d: -;CHECK: smlal.2d +; CHECK-LABEL: smlal2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlal.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -274,8 +386,13 @@ define <2 x i64> @smlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind } define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: smlsl4s: -;CHECK: smlsl.4s +; CHECK-LABEL: smlsl4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlsl.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -285,8 +402,13 @@ define <4 x i32> @smlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind } define <2 x i64> @smlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 
x i64>* %C) nounwind { -;CHECK-LABEL: smlsl2d: -;CHECK: smlsl.2d +; CHECK-LABEL: smlsl2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlsl.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -301,8 +423,13 @@ declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlal4s: -;CHECK: sqdmlal.4s +; CHECK-LABEL: sqdmlal4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlal.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -312,8 +439,13 @@ define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin } define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlal2d: -;CHECK: sqdmlal.2d +; CHECK-LABEL: sqdmlal2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlal.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -323,8 +455,13 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin } define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlal2_4s: -;CHECK: sqdmlal.4s +; CHECK-LABEL: sqdmlal2_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1, #8] +; CHECK-NEXT: sqdmlal.4s v0, v1, v2 +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x 
i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -336,8 +473,13 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw } define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlal2_2d: -;CHECK: sqdmlal.2d +; CHECK-LABEL: sqdmlal2_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1, #8] +; CHECK-NEXT: sqdmlal.2d v0, v1, v2 +; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -349,8 +491,13 @@ define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw } define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlsl4s: -;CHECK: sqdmlsl.4s +; CHECK-LABEL: sqdmlsl4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -360,8 +507,13 @@ define <4 x i32> @sqdmlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwin } define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlsl2d: -;CHECK: sqdmlsl.2d +; CHECK-LABEL: sqdmlsl2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -371,8 +523,13 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin } define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlsl2_4s: -;CHECK: sqdmlsl.4s +; CHECK-LABEL: sqdmlsl2_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: 
ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1, #8] +; CHECK-NEXT: sqdmlsl.4s v0, v1, v2 +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -384,8 +541,13 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw } define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlsl2_2d: -;CHECK: sqdmlsl.2d +; CHECK-LABEL: sqdmlsl2_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1, #8] +; CHECK-NEXT: sqdmlsl.2d v0, v1, v2 +; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -397,8 +559,13 @@ define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounw } define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: umlal4s: -;CHECK: umlal.4s +; CHECK-LABEL: umlal4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlal.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -408,8 +575,13 @@ define <4 x i32> @umlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind } define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: umlal2d: -;CHECK: umlal.2d +; CHECK-LABEL: umlal2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlal.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -419,8 +591,13 @@ define <2 x i64> @umlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind } define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) 
nounwind { -;CHECK-LABEL: umlsl4s: -;CHECK: umlsl.4s +; CHECK-LABEL: umlsl4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlsl.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -430,8 +607,13 @@ define <4 x i32> @umlsl4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind } define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: umlsl2d: -;CHECK: umlsl.2d +; CHECK-LABEL: umlsl2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlsl.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -441,8 +623,13 @@ define <2 x i64> @umlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind } define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { -;CHECK-LABEL: fmla_2s: -;CHECK: fmla.2s +; CHECK-LABEL: fmla_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: fmla.2s v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = load <2 x float>, <2 x float>* %C @@ -451,8 +638,13 @@ define <2 x float> @fmla_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) n } define <4 x float> @fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { -;CHECK-LABEL: fmla_4s: -;CHECK: fmla.4s +; CHECK-LABEL: fmla_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmla.4s v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = load <4 x float>, <4 x float>* %C @@ -461,8 +653,13 @@ define <4 x float> 
@fmla_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) n } define <2 x double> @fmla_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { -;CHECK-LABEL: fmla_2d: -;CHECK: fmla.2d +; CHECK-LABEL: fmla_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmla.2d v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = load <2 x double>, <2 x double>* %C @@ -475,8 +672,13 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounw declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { -;CHECK-LABEL: fmls_2s: -;CHECK: fmls.2s +; CHECK-LABEL: fmls_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: fmls.2s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = load <2 x float>, <2 x float>* %C @@ -486,8 +688,13 @@ define <2 x float> @fmls_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) n } define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { -;CHECK-LABEL: fmls_4s: -;CHECK: fmls.4s +; CHECK-LABEL: fmls_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmls.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = load <4 x float>, <4 x float>* %C @@ -497,8 +704,13 @@ define <4 x float> @fmls_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) n } define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { -;CHECK-LABEL: fmls_2d: -;CHECK: fmls.2d +; CHECK-LABEL: fmls_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr 
q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmls.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = load <2 x double>, <2 x double>* %C @@ -508,8 +720,13 @@ define <2 x double> @fmls_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* % } define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x float>* %C) nounwind { -;CHECK-LABEL: fmls_commuted_neg_2s: -;CHECK: fmls.2s +; CHECK-LABEL: fmls_commuted_neg_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr d0, [x2] +; CHECK-NEXT: fmls.2s v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = load <2 x float>, <2 x float>* %C @@ -519,8 +736,13 @@ define <2 x float> @fmls_commuted_neg_2s(<2 x float>* %A, <2 x float>* %B, <2 x } define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind { -;CHECK-LABEL: fmls_commuted_neg_4s: -;CHECK: fmls.4s +; CHECK-LABEL: fmls_commuted_neg_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmls.4s v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = load <4 x float>, <4 x float>* %C @@ -530,8 +752,13 @@ define <4 x float> @fmls_commuted_neg_4s(<4 x float>* %A, <4 x float>* %B, <4 x } define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 x double>* %C) nounwind { -;CHECK-LABEL: fmls_commuted_neg_2d: -;CHECK: fmls.2d +; CHECK-LABEL: fmls_commuted_neg_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: fmls.2d v0, v2, v1 +; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = load <2 x double>, <2 x 
double>* %C @@ -541,8 +768,11 @@ define <2 x double> @fmls_commuted_neg_2d(<2 x double>* %A, <2 x double>* %B, <2 } define <2 x float> @fmls_indexed_2s(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp { -;CHECK-LABEL: fmls_indexed_2s: -;CHECK: fmls.2s +; CHECK-LABEL: fmls_indexed_2s: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmls.2s v0, v2, v1[0] +; CHECK-NEXT: ret entry: %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %c %lane = shufflevector <2 x float> %b, <2 x float> undef, <2 x i32> zeroinitializer @@ -551,8 +781,10 @@ entry: } define <4 x float> @fmls_indexed_4s(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp { -;CHECK-LABEL: fmls_indexed_4s: -;CHECK: fmls.4s +; CHECK-LABEL: fmls_indexed_4s: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls.4s v0, v2, v1[0] +; CHECK-NEXT: ret entry: %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c %lane = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -561,8 +793,10 @@ entry: } define <2 x double> @fmls_indexed_2d(<2 x double> %a, <2 x double> %b, <2 x double> %c) nounwind readnone ssp { -;CHECK-LABEL: fmls_indexed_2d: -;CHECK: fmls.2d +; CHECK-LABEL: fmls_indexed_2d: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmls.2d v0, v2, v1[0] +; CHECK-NEXT: ret entry: %0 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c %lane = shufflevector <2 x double> %b, <2 x double> undef, <2 x i32> zeroinitializer @@ -571,10 +805,12 @@ entry: } define <2 x float> @fmla_indexed_scalar_2s(<2 x float> %a, <2 x float> %b, float %c) nounwind readnone ssp { -entry: ; CHECK-LABEL: fmla_indexed_scalar_2s: -; CHECK-NEXT: fmla.2s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $d2 +; CHECK-NEXT: fmla.2s v0, v1, v2 +; CHECK-NEXT: ret +entry: %v1 = insertelement <2 x float> undef, float %c, i32 0 %v2 = insertelement <2 x float> %v1, float %c, i32 1 %fmla1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %v1, <2 x float> %b, <2 x float> %a) nounwind @@ 
-582,10 +818,12 @@ entry: } define <4 x float> @fmla_indexed_scalar_4s(<4 x float> %a, <4 x float> %b, float %c) nounwind readnone ssp { -entry: ; CHECK-LABEL: fmla_indexed_scalar_4s: -; CHECK-NEXT: fmla.4s -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: fmla.4s v0, v1, v2[0] +; CHECK-NEXT: ret +entry: %v1 = insertelement <4 x float> undef, float %c, i32 0 %v2 = insertelement <4 x float> %v1, float %c, i32 1 %v3 = insertelement <4 x float> %v2, float %c, i32 2 @@ -596,8 +834,10 @@ entry: define <2 x double> @fmla_indexed_scalar_2d(<2 x double> %a, <2 x double> %b, double %c) nounwind readnone ssp { ; CHECK-LABEL: fmla_indexed_scalar_2d: -; CHECK-NEXT: fmla.2d -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla.2d v0, v1, v2[0] +; CHECK-NEXT: ret entry: %v1 = insertelement <2 x double> undef, double %c, i32 0 %v2 = insertelement <2 x double> %v1, double %c, i32 1 @@ -606,9 +846,12 @@ entry: } define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: mul_4h: -;CHECK-NOT: dup -;CHECK: mul.4h +; CHECK-LABEL: mul_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: mul.4h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -617,9 +860,12 @@ define <4 x i16> @mul_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <8 x i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: mul_8h: -;CHECK-NOT: dup -;CHECK: mul.8h +; CHECK-LABEL: mul_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mul.8h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> @@ -628,9 +874,12 @@ define <8 x 
i16> @mul_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: mul_2s: -;CHECK-NOT: dup -;CHECK: mul.2s +; CHECK-LABEL: mul_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: mul.2s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1> @@ -639,9 +888,12 @@ define <2 x i32> @mul_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: mul_4s: -;CHECK-NOT: dup -;CHECK: mul.4s +; CHECK-LABEL: mul_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: mul.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -651,16 +903,27 @@ define <4 x i32> @mul_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK-LABEL: mul_2d: -; CHECK: mul -; CHECK: mul +; CHECK: // %bb.0: +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov.d x8, v1[1] +; CHECK-NEXT: mov.d x9, v0[1] +; CHECK-NEXT: mul x10, x11, x10 +; CHECK-NEXT: mul x8, x9, x8 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov.d v0[1], x8 +; CHECK-NEXT: ret %tmp1 = mul <2 x i64> %A, %B ret <2 x i64> %tmp1 } define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK-LABEL: fmul_lane_2s: -;CHECK-NOT: dup -;CHECK: fmul.2s +; CHECK-LABEL: fmul_lane_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: fmul.2s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1> @@ -669,9 +932,12 @@ 
define <2 x float> @fmul_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { } define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: fmul_lane_4s: -;CHECK-NOT: dup -;CHECK: fmul.4s +; CHECK-LABEL: fmul_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmul.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> @@ -680,9 +946,12 @@ define <4 x float> @fmul_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { } define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { -;CHECK-LABEL: fmul_lane_2d: -;CHECK-NOT: dup -;CHECK: fmul.2d +; CHECK-LABEL: fmul_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmul.2d v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> @@ -691,18 +960,20 @@ define <2 x double> @fmul_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { } define float @fmul_lane_s(float %A, <4 x float> %vec) nounwind { -;CHECK-LABEL: fmul_lane_s: -;CHECK-NOT: dup -;CHECK: fmul.s s0, s0, v1[3] +; CHECK-LABEL: fmul_lane_s: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul.s s0, s0, v1[3] +; CHECK-NEXT: ret %B = extractelement <4 x float> %vec, i32 3 %res = fmul float %A, %B ret float %res } define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { -;CHECK-LABEL: fmul_lane_d: -;CHECK-NOT: dup -;CHECK: fmul.d d0, d0, v1[1] +; CHECK-LABEL: fmul_lane_d: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul.d d0, d0, v1[1] +; CHECK-NEXT: ret %B = extractelement <2 x double> %vec, i32 1 %res = fmul double %A, %B ret double %res @@ -711,9 +982,12 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { define <2 x float> @fmulx_lane_2s(<2 
x float>* %A, <2 x float>* %B) nounwind { -;CHECK-LABEL: fmulx_lane_2s: -;CHECK-NOT: dup -;CHECK: fmulx.2s +; CHECK-LABEL: fmulx_lane_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: fmulx.2s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x float>, <2 x float>* %A %tmp2 = load <2 x float>, <2 x float>* %B %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> @@ -722,9 +996,12 @@ define <2 x float> @fmulx_lane_2s(<2 x float>* %A, <2 x float>* %B) nounwind { } define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK-LABEL: fmulx_lane_4s: -;CHECK-NOT: dup -;CHECK: fmulx.4s +; CHECK-LABEL: fmulx_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmulx.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x float>, <4 x float>* %A %tmp2 = load <4 x float>, <4 x float>* %B %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> @@ -733,9 +1010,12 @@ define <4 x float> @fmulx_lane_4s(<4 x float>* %A, <4 x float>* %B) nounwind { } define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind { -;CHECK-LABEL: fmulx_lane_2d: -;CHECK-NOT: dup -;CHECK: fmulx.2d +; CHECK-LABEL: fmulx_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: fmulx.2d v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x double>, <2 x double>* %A %tmp2 = load <2 x double>, <2 x double>* %B %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> @@ -744,9 +1024,12 @@ define <2 x double> @fmulx_lane_2d(<2 x double>* %A, <2 x double>* %B) nounwind } define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmulh_lane_4h: -;CHECK-NOT: dup -;CHECK: sqdmulh.4h +; CHECK-LABEL: sqdmulh_lane_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x 
i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> @@ -755,9 +1038,12 @@ define <4 x i16> @sqdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmulh_lane_8h: -;CHECK-NOT: dup -;CHECK: sqdmulh.8h +; CHECK-LABEL: sqdmulh_lane_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> @@ -766,9 +1052,12 @@ define <8 x i16> @sqdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmulh_lane_2s: -;CHECK-NOT: dup -;CHECK: sqdmulh.2s +; CHECK-LABEL: sqdmulh_lane_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> @@ -777,9 +1066,12 @@ define <2 x i32> @sqdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmulh_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmulh.4s +; CHECK-LABEL: sqdmulh_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> @@ -788,18 +1080,24 @@ define <4 x i32> @sqdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { -;CHECK-LABEL: sqdmulh_lane_1s: -;CHECK-NOT: dup 
-;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1] +; CHECK-LABEL: sqdmulh_lane_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqdmulh.s s0, s1, v0[1] +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %tmp1 = extractelement <4 x i32> %B, i32 1 %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1) ret i32 %tmp2 } define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_lane_4h: -;CHECK-NOT: dup -;CHECK: sqrdmulh.4h +; CHECK-LABEL: sqrdmulh_lane_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> @@ -808,9 +1106,12 @@ define <4 x i16> @sqrdmulh_lane_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_lane_8h: -;CHECK-NOT: dup -;CHECK: sqrdmulh.8h +; CHECK-LABEL: sqrdmulh_lane_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> @@ -819,9 +1120,12 @@ define <8 x i16> @sqrdmulh_lane_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_lane_2s: -;CHECK-NOT: dup -;CHECK: sqrdmulh.2s +; CHECK-LABEL: sqrdmulh_lane_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> @@ -830,9 +1134,12 @@ define <2 x i32> @sqrdmulh_lane_2s(<2 x i32>* %A, <2 x 
i32>* %B) nounwind { } define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqrdmulh_lane_4s: -;CHECK-NOT: dup -;CHECK: sqrdmulh.4s +; CHECK-LABEL: sqrdmulh_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> @@ -841,18 +1148,24 @@ define <4 x i32> @sqrdmulh_lane_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { -;CHECK-LABEL: sqrdmulh_lane_1s: -;CHECK-NOT: dup -;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1] +; CHECK-LABEL: sqrdmulh_lane_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: sqrdmulh.s s0, s1, v0[1] +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret %tmp1 = extractelement <4 x i32> %B, i32 1 %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1) ret i32 %tmp2 } define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmull_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmull.4s +; CHECK-LABEL: sqdmull_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> @@ -861,9 +1174,12 @@ define <4 x i32> @sqdmull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmull_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmull.2d +; CHECK-LABEL: sqdmull_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = 
shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> @@ -872,9 +1188,12 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK-LABEL: sqdmull2_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmull.4s +; CHECK-LABEL: sqdmull2_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, #8] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -884,9 +1203,12 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { } define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK-LABEL: sqdmull2_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmull.2d +; CHECK-LABEL: sqdmull2_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, #8] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -896,9 +1218,12 @@ define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { } define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: umull_lane_4s: -;CHECK-NOT: dup -;CHECK: umull.4s +; CHECK-LABEL: umull_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: umull.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> @@ -907,9 +1232,12 @@ define <4 x i32> @umull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: umull_lane_2d: -;CHECK-NOT: dup -;CHECK: umull.2d +; CHECK-LABEL: umull_lane_2d: +; 
CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: umull.2d v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> @@ -918,9 +1246,12 @@ define <2 x i64> @umull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK-LABEL: smull_lane_4s: -;CHECK-NOT: dup -;CHECK: smull.4s +; CHECK-LABEL: smull_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: smull.4s v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> @@ -929,9 +1260,12 @@ define <4 x i32> @smull_lane_4s(<4 x i16>* %A, <4 x i16>* %B) nounwind { } define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK-LABEL: smull_lane_2d: -;CHECK-NOT: dup -;CHECK: smull.2d +; CHECK-LABEL: smull_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: smull.2d v0, v0, v1[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> @@ -940,9 +1274,13 @@ define <2 x i64> @smull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { } define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: smlal_lane_4s: -;CHECK-NOT: dup -;CHECK: smlal.4s +; CHECK-LABEL: smlal_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlal.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -953,9 +1291,13 @@ define <4 x i32> @smlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x 
i32>* %C) nou } define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: smlal_lane_2d: -;CHECK-NOT: dup -;CHECK: smlal.2d +; CHECK-LABEL: smlal_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlal.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -966,9 +1308,13 @@ define <2 x i64> @smlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou } define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlal_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmlal.4s +; CHECK-LABEL: sqdmlal_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -979,9 +1325,13 @@ define <4 x i32> @sqdmlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n } define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlal_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmlal.2d +; CHECK-LABEL: sqdmlal_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -992,9 +1342,13 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n } define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlal2_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmlal.4s +; CHECK-LABEL: sqdmlal2_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, 
#8] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1] +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1006,9 +1360,13 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) } define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlal2_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmlal.2d +; CHECK-LABEL: sqdmlal2_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: ret %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1020,8 +1378,14 @@ define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) } define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { -;CHECK-LABEL: sqdmlal_lane_1s: -;CHECK: sqdmlal.4s +; CHECK-LABEL: sqdmlal_lane_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqdmlal.4s v2, v1, v0[1] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret %lhs = insertelement <4 x i16> undef, i16 %B, i32 0 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) @@ -1032,8 +1396,14 @@ define i32 @sqdmlal_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { -;CHECK-LABEL: sqdmlsl_lane_1s: -;CHECK: sqdmlsl.4s +; CHECK-LABEL: sqdmlsl_lane_1s: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s1, w1 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqdmlsl.4s v2, v1, v0[1] +; CHECK-NEXT: fmov w0, s2 +; CHECK-NEXT: ret %lhs = insertelement <4 x 
i16> undef, i16 %B, i32 0 %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs) @@ -1044,8 +1414,14 @@ define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind { declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { -;CHECK-LABEL: sqdmlal_lane_1d: -;CHECK: sqdmlal.s +; CHECK-LABEL: sqdmlal_lane_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d1, x0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqdmlal.s d1, s2, v0[1] +; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod) @@ -1055,8 +1431,14 @@ declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32) declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { -;CHECK-LABEL: sqdmlsl_lane_1d: -;CHECK: sqdmlsl.s +; CHECK-LABEL: sqdmlsl_lane_1d: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d1, x0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: sqdmlsl.s d1, s2, v0[1] +; CHECK-NEXT: fmov x0, d1 +; CHECK-NEXT: ret %rhs = extractelement <2 x i32> %C, i32 1 %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs) %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod) @@ -1066,9 +1448,13 @@ declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: umlal_lane_4s: -;CHECK-NOT: dup -;CHECK: umlal.4s +; CHECK-LABEL: umlal_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlal.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = 
load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1079,9 +1465,13 @@ define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou } define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: umlal_lane_2d: -;CHECK-NOT: dup -;CHECK: umlal.2d +; CHECK-LABEL: umlal_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlal.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1093,9 +1483,13 @@ define <2 x i64> @umlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: smlsl_lane_4s: -;CHECK-NOT: dup -;CHECK: smlsl.4s +; CHECK-LABEL: smlsl_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlsl.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1106,9 +1500,13 @@ define <4 x i32> @smlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou } define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: smlsl_lane_2d: -;CHECK-NOT: dup -;CHECK: smlsl.2d +; CHECK-LABEL: smlsl_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: smlsl.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1119,9 +1517,13 @@ define <2 x i64> @smlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou } define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlsl_lane_4s: 
-;CHECK-NOT: dup -;CHECK: sqdmlsl.4s +; CHECK-LABEL: sqdmlsl_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1132,9 +1534,13 @@ define <4 x i32> @sqdmlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) n } define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlsl_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmlsl.2d +; CHECK-LABEL: sqdmlsl_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1145,9 +1551,13 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n } define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: sqdmlsl2_lane_4s: -;CHECK-NOT: dup -;CHECK: sqdmlsl.4s +; CHECK-LABEL: sqdmlsl2_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1] +; CHECK-NEXT: ret %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1159,9 +1569,13 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) } define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: sqdmlsl2_lane_2d: -;CHECK-NOT: dup -;CHECK: sqdmlsl.2d +; CHECK-LABEL: sqdmlsl2_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: ldr d1, [x0, #8] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: ret %load1 = load <4 x i32>, 
<4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1173,9 +1587,13 @@ define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) } define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind { -;CHECK-LABEL: umlsl_lane_4s: -;CHECK-NOT: dup -;CHECK: umlsl.4s +; CHECK-LABEL: umlsl_lane_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlsl.4s v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1186,9 +1604,13 @@ define <4 x i32> @umlsl_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nou } define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind { -;CHECK-LABEL: umlsl_lane_2d: -;CHECK-NOT: dup -;CHECK: umlsl.2d +; CHECK-LABEL: umlsl_lane_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d2, [x1] +; CHECK-NEXT: ldr q0, [x2] +; CHECK-NEXT: umlsl.2d v0, v1, v2[1] +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1201,35 +1623,39 @@ define <2 x i64> @umlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nou ; Scalar FMULX define float @fmulxs(float %a, float %b) nounwind { ; CHECK-LABEL: fmulxs: -; CHECK-NEXT: fmulx s0, s0, s1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmulx s0, s0, s1 +; CHECK-NEXT: ret %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd(double %a, double %b) nounwind { ; CHECK-LABEL: fmulxd: -; CHECK-NEXT: fmulx d0, d0, d1 +; CHECK: // %bb.0: +; CHECK-NEXT: fmulx d0, d0, d1 +; CHECK-NEXT: ret %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECK-NEXT: ret ret double %fmulx.i } define float @fmulxs_lane(float %a, <4 x 
float> %vec) nounwind { ; CHECK-LABEL: fmulxs_lane: -; CHECK-NEXT: fmulx.s s0, s0, v1[3] +; CHECK: // %bb.0: +; CHECK-NEXT: fmulx.s s0, s0, v1[3] +; CHECK-NEXT: ret %b = extractelement <4 x float> %vec, i32 3 %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind -; CHECK-NEXT: ret ret float %fmulx.i } define double @fmulxd_lane(double %a, <2 x double> %vec) nounwind { ; CHECK-LABEL: fmulxd_lane: -; CHECK-NEXT: fmulx.d d0, d0, v1[1] +; CHECK: // %bb.0: +; CHECK-NEXT: fmulx.d d0, d0, v1[1] +; CHECK-NEXT: ret %b = extractelement <2 x double> %vec, i32 1 %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind -; CHECK-NEXT: ret ret double %fmulx.i } @@ -1239,8 +1665,9 @@ declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: smull2_8h_simple: -; CHECK-NEXT: smull2.8h v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: smull2.8h v0, v0, v1 +; CHECK-NEXT: ret %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2 @@ -1249,7 +1676,9 @@ define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind { define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: foo0: -; CHECK: smull2.8h v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: smull2.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8> @@ -1262,7 +1691,9 @@ define <8 x i16> @foo0(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: foo1: -; CHECK: smull2.4s v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: smull2.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <8 x 
i16> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> @@ -1275,7 +1706,9 @@ define <4 x i32> @foo1(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: foo2: -; CHECK: smull2.2d v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: smull2.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> @@ -1288,7 +1721,9 @@ define <2 x i64> @foo2(<4 x i32> %a, <4 x i32> %b) nounwind { define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: foo3: -; CHECK: umull2.8h v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: umull2.8h v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <8 x i8> @@ -1301,7 +1736,9 @@ define <8 x i16> @foo3(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: foo4: -; CHECK: umull2.4s v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: umull2.4s v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <4 x i16> @@ -1314,7 +1751,9 @@ define <4 x i32> @foo4(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: foo5: -; CHECK: umull2.2d v0, v0, v1 +; CHECK: // %bb.0: +; CHECK-NEXT: umull2.2d v0, v0, v1 +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> @@ -1327,8 +1766,10 @@ define <2 x i64> @foo5(<4 x i32> %a, <4 x i32> %b) nounwind { define <4 x i32> 
@foo6(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo6: -; CHECK-NEXT: smull2.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smull2.4s v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1340,8 +1781,10 @@ entry: define <4 x i32> @foo6a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo6a: -; CHECK-NEXT: smull.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smull.4s v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1353,8 +1796,10 @@ entry: define <2 x i64> @foo7(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo7: -; CHECK-NEXT: smull2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smull2.2d v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1366,8 +1811,10 @@ entry: define <2 x i64> @foo7a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo7a: -; CHECK-NEXT: smull.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smull.2d v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1380,8 +1827,10 @@ entry: define <4 x i32> @foo8(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo8: -; CHECK-NEXT: umull2.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umull2.4s v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1393,8 +1842,10 @@ entry: define <4 x i32> @foo8a(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo8a: -; CHECK-NEXT: umull.4s v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umull.4s v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1406,8 +1857,10 @@ entry: define <2 x i64> @foo9(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo9: -; CHECK-NEXT: umull2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umull2.2d v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1419,8 +1872,10 @@ entry: define <2 x i64> @foo9a(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp { ; CHECK-LABEL: foo9a: -; CHECK-NEXT: umull.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umull.2d v0, v1, v2[1] +; CHECK-NEXT: ret entry: %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1432,9 +1887,9 @@ entry: define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { ; CHECK-LABEL: bar0: -; CHECK: smlal2.8h v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: smlal2.8h v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> 
%tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> @@ -1448,9 +1903,9 @@ define <8 x i16> @bar0(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { ; CHECK-LABEL: bar1: -; CHECK: smlal2.4s v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: smlal2.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> @@ -1464,9 +1919,9 @@ define <4 x i32> @bar1(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { ; CHECK-LABEL: bar2: -; CHECK: smlal2.2d v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: smlal2.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> @@ -1480,9 +1935,9 @@ define <2 x i64> @bar2(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { ; CHECK-LABEL: bar3: -; CHECK: umlal2.8h v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: umlal2.8h v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> @@ -1496,9 +1951,9 @@ define <8 x i16> @bar3(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) nounwind { define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { ; CHECK-LABEL: bar4: -; CHECK: umlal2.4s v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: umlal2.4s v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> 
%shuffle.i.i.i to <4 x i16> @@ -1512,9 +1967,9 @@ define <4 x i32> @bar4(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) nounwind { define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { ; CHECK-LABEL: bar5: -; CHECK: umlal2.2d v0, v1, v2 -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: umlal2.2d v0, v1, v2 +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> @@ -1528,8 +1983,10 @@ define <2 x i64> @bar5(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) nounwind { define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { ; CHECK-LABEL: mlal2_1: -; CHECK: smlal2.4s v0, v1, v2[3] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smlal2.4s v0, v1, v2[3] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> @@ -1544,8 +2001,10 @@ define <4 x i32> @mlal2_1(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { ; CHECK-LABEL: mlal2_2: -; CHECK: smlal2.2d v0, v1, v2[1] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: smlal2.2d v0, v1, v2[1] +; CHECK-NEXT: ret %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> @@ -1560,9 +2019,10 @@ define <2 x i64> @mlal2_2(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { ; CHECK-LABEL: mlal2_4: -; CHECK: umlal2.4s v0, v1, v2[2] -; CHECK-NEXT: ret - +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: 
umlal2.4s v0, v1, v2[2] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <8 x i32> %tmp = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> @@ -1577,8 +2037,10 @@ define <4 x i32> @mlal2_4(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c) nounwind { define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { ; CHECK-LABEL: mlal2_5: -; CHECK: umlal2.2d v0, v1, v2[0] -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umlal2.2d v0, v1, v2[0] +; CHECK-NEXT: ret %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <4 x i32> zeroinitializer %tmp = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> @@ -1593,10 +2055,12 @@ define <2 x i64> @mlal2_5(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c) nounwind { ; rdar://12328502 define <2 x double> @vmulq_n_f64(<2 x double> %x, double %y) nounwind readnone ssp { -entry: ; CHECK-LABEL: vmulq_n_f64: -; CHECK-NOT: dup.2d -; CHECK: fmul.2d v0, v0, v1[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmul.2d v0, v0, v1[0] +; CHECK-NEXT: ret +entry: %vecinit.i = insertelement <2 x double> undef, double %y, i32 0 %vecinit1.i = insertelement <2 x double> %vecinit.i, double %y, i32 1 %mul.i = fmul <2 x double> %vecinit1.i, %x @@ -1604,10 +2068,12 @@ entry: } define <4 x float> @vmulq_n_f32(<4 x float> %x, float %y) nounwind readnone ssp { -entry: ; CHECK-LABEL: vmulq_n_f32: -; CHECK-NOT: dup.4s -; CHECK: fmul.4s v0, v0, v1[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: fmul.4s v0, v0, v1[0] +; CHECK-NEXT: ret +entry: %vecinit.i = insertelement <4 x float> undef, float %y, i32 0 %vecinit1.i = insertelement <4 x float> %vecinit.i, float %y, i32 1 %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %y, i32 2 @@ -1617,10 +2083,12 @@ entry: 
} define <2 x float> @vmul_n_f32(<2 x float> %x, float %y) nounwind readnone ssp { -entry: ; CHECK-LABEL: vmul_n_f32: -; CHECK-NOT: dup.2s -; CHECK: fmul.2s v0, v0, v1[0] +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-NEXT: fmul.2s v0, v0, v1[0] +; CHECK-NEXT: ret +entry: %vecinit.i = insertelement <2 x float> undef, float %y, i32 0 %vecinit1.i = insertelement <2 x float> %vecinit.i, float %y, i32 1 %mul.i = fmul <2 x float> %vecinit1.i, %x @@ -1628,11 +2096,11 @@ entry: } define <4 x i16> @vmla_laneq_s16_test(<4 x i16> %a, <4 x i16> %b, <8 x i16> %c) nounwind readnone ssp { +; CHECK-LABEL: vmla_laneq_s16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla.4h v0, v1, v2[6] +; CHECK-NEXT: ret entry: -; CHECK: vmla_laneq_s16_test -; CHECK-NOT: ext -; CHECK: mla.4h v0, v1, v2[6] -; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b %add = add <4 x i16> %mul, %a @@ -1640,11 +2108,11 @@ entry: } define <2 x i32> @vmla_laneq_s32_test(<2 x i32> %a, <2 x i32> %b, <4 x i32> %c) nounwind readnone ssp { +; CHECK-LABEL: vmla_laneq_s32_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla.2s v0, v1, v2[3] +; CHECK-NEXT: ret entry: -; CHECK: vmla_laneq_s32_test -; CHECK-NOT: ext -; CHECK: mla.2s v0, v1, v2[3] -; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b %add = add <2 x i32> %mul, %a @@ -1652,11 +2120,11 @@ entry: } define <8 x i16> @not_really_vmlaq_laneq_s16_test(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone ssp { +; CHECK-LABEL: not_really_vmlaq_laneq_s16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla.8h v0, v1, v2[5] +; CHECK-NEXT: ret entry: -; CHECK: not_really_vmlaq_laneq_s16_test -; CHECK-NOT: ext -; CHECK: mla.8h v0, v1, v2[5] -; CHECK-NEXT: ret %shuffle1 = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> %shuffle2 = shufflevector <4 x i16> %shuffle1, <4 x i16> 
undef, <8 x i32> %mul = mul <8 x i16> %shuffle2, %b @@ -1665,11 +2133,11 @@ entry: } define <4 x i32> @not_really_vmlaq_laneq_s32_test(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone ssp { +; CHECK-LABEL: not_really_vmlaq_laneq_s32_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla.4s v0, v1, v2[3] +; CHECK-NEXT: ret entry: -; CHECK: not_really_vmlaq_laneq_s32_test -; CHECK-NOT: ext -; CHECK: mla.4s v0, v1, v2[3] -; CHECK-NEXT: ret %shuffle1 = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> %shuffle2 = shufflevector <2 x i32> %shuffle1, <2 x i32> undef, <4 x i32> %mul = mul <4 x i32> %shuffle2, %b @@ -1678,54 +2146,55 @@ entry: } define <4 x i32> @vmull_laneq_s16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { +; CHECK-LABEL: vmull_laneq_s16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull.4s v0, v0, v1[6] +; CHECK-NEXT: ret entry: -; CHECK: vmull_laneq_s16_test -; CHECK-NOT: ext -; CHECK: smull.4s v0, v0, v1[6] -; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 ret <4 x i32> %vmull2.i } define <2 x i64> @vmull_laneq_s32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { +; CHECK-LABEL: vmull_laneq_s32_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull.2d v0, v0, v1[2] +; CHECK-NEXT: ret entry: -; CHECK: vmull_laneq_s32_test -; CHECK-NOT: ext -; CHECK: smull.2d v0, v0, v1[2] -; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 ret <2 x i64> %vmull2.i } define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp { +; CHECK-LABEL: vmull_laneq_u16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull.4s v0, v0, v1[6] +; CHECK-NEXT: ret entry: -; CHECK: vmull_laneq_u16_test -; CHECK-NOT: ext -; CHECK: umull.4s v0, v0, v1[6] 
-; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2 ret <4 x i32> %vmull2.i } define <2 x i64> @vmull_laneq_u32_test(<2 x i32> %a, <4 x i32> %b) nounwind readnone ssp { +; CHECK-LABEL: vmull_laneq_u32_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull.2d v0, v0, v1[2] +; CHECK-NEXT: ret entry: -; CHECK: vmull_laneq_u32_test -; CHECK-NOT: ext -; CHECK: umull.2d v0, v0, v1[2] -; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2 ret <2 x i64> %vmull2.i } define <4 x i32> @vmull_low_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { +; CHECK-LABEL: vmull_low_n_s16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup.4h v0, w0 +; CHECK-NEXT: smull.4s v0, v1, v0 +; CHECK-NEXT: ret entry: -; CHECK: vmull_low_n_s16_test -; CHECK-NOT: ext -; CHECK: smull.4s -; CHECK-NEXT: ret %conv = trunc i32 %d to i16 %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1739,11 +2208,12 @@ entry: } define <4 x i32> @vmull_high_n_s16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { +; CHECK-LABEL: vmull_high_n_s16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup.8h v0, w0 +; CHECK-NEXT: smull2.4s v0, v1, v0 +; CHECK-NEXT: ret entry: -; CHECK: vmull_high_n_s16_test -; CHECK-NOT: ext -; CHECK: smull2.4s -; CHECK-NEXT: ret %conv = trunc i32 %d to i16 %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1757,11 +2227,12 @@ entry: } define <2 x i64> @vmull_high_n_s32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { +; CHECK-LABEL: vmull_high_n_s32_test: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: dup.4s v0, w0 +; CHECK-NEXT: smull2.2d v0, v1, v0 +; CHECK-NEXT: ret entry: -; CHECK: vmull_high_n_s32_test -; CHECK-NOT: ext -; CHECK: smull2.2d -; CHECK-NEXT: ret %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> @@ -1772,11 +2243,12 @@ entry: } define <4 x i32> @vmull_high_n_u16_test(<4 x i32> %a, <8 x i16> %b, <4 x i16> %c, i32 %d) nounwind readnone optsize ssp { +; CHECK-LABEL: vmull_high_n_u16_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup.8h v0, w0 +; CHECK-NEXT: umull2.4s v0, v1, v0 +; CHECK-NEXT: ret entry: -; CHECK: vmull_high_n_u16_test -; CHECK-NOT: ext -; CHECK: umull2.4s -; CHECK-NEXT: ret %conv = trunc i32 %d to i16 %0 = bitcast <8 x i16> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> @@ -1790,11 +2262,12 @@ entry: } define <2 x i64> @vmull_high_n_u32_test(<2 x i64> %a, <4 x i32> %b, <2 x i32> %c, i32 %d) nounwind readnone optsize ssp { +; CHECK-LABEL: vmull_high_n_u32_test: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup.4s v0, w0 +; CHECK-NEXT: umull2.2d v0, v1, v0 +; CHECK-NEXT: ret entry: -; CHECK: vmull_high_n_u32_test -; CHECK-NOT: ext -; CHECK: umull2.2d -; CHECK-NEXT: ret %0 = bitcast <4 x i32> %b to <2 x i64> %shuffle.i.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32> @@ -1806,9 +2279,9 @@ entry: define <4 x i32> @vmul_built_dup_test(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vmul_built_dup_test: -; CHECK-NOT: ins -; CHECK-NOT: dup -; CHECK: mul.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[1] +; CHECK: // %bb.0: +; CHECK-NEXT: mul.4s v0, v0, v1[1] +; CHECK-NEXT: ret %vget_lane = extractelement <4 x i32> %b, i32 1 %vecinit.i = insertelement <4 x i32> undef, i32 %vget_lane, i32 0 %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %vget_lane, i32 1 @@ -1820,9 +2293,10 @@ define <4 x i32> @vmul_built_dup_test(<4 x i32> 
%a, <4 x i32> %b) { define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: vmul_built_dup_fromsmall_test: -; CHECK-NOT: ins -; CHECK-NOT: dup -; CHECK: mul.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[3] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mul.4h v0, v0, v1[3] +; CHECK-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 3 %vecinit.i = insertelement <4 x i16> undef, i16 %vget_lane, i32 0 %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %vget_lane, i32 1 @@ -1834,9 +2308,10 @@ define <4 x i16> @vmul_built_dup_fromsmall_test(<4 x i16> %a, <4 x i16> %b) { define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: vmulq_built_dup_fromsmall_test: -; CHECK-NOT: ins -; CHECK-NOT: dup -; CHECK: mul.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mul.8h v0, v0, v1[0] +; CHECK-NEXT: ret %vget_lane = extractelement <4 x i16> %b, i32 0 %vecinit.i = insertelement <8 x i16> undef, i16 %vget_lane, i32 0 %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %vget_lane, i32 1 @@ -1852,9 +2327,9 @@ define <8 x i16> @vmulq_built_dup_fromsmall_test(<8 x i16> %a, <4 x i16> %b) { define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: mull_from_two_extracts: -; CHECK-NOT: ext -; CHECK: sqdmull2.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmull2.2d v0, v0, v1 +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -1864,9 +2339,9 @@ define <2 x i64> @mull_from_two_extracts(<4 x i32> %lhs, <4 x i32> %rhs) { define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: mlal_from_two_extracts: -; CHECK-NOT: ext -; CHECK: sqdmlal2.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmlal2.2d v0, v1, v2 +; CHECK-NEXT: 
ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -1877,8 +2352,10 @@ define <2 x i64> @mlal_from_two_extracts(<2 x i64> %accum, <4 x i32> %lhs, <4 x define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: mull_from_extract_dup_low: -; CHECK-NOT: ext -; CHECK: sqdmull.2d +; CHECK: // %bb.0: +; CHECK-NEXT: dup.2s v1, w0 +; CHECK-NEXT: sqdmull.2d v0, v0, v1 +; CHECK-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -1890,8 +2367,10 @@ define <2 x i64> @mull_from_extract_dup_low(<4 x i32> %lhs, i32 %rhs) { define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { ; CHECK-LABEL: mull_from_extract_dup_high: -; CHECK-NOT: ext -; CHECK: sqdmull2.2d +; CHECK: // %bb.0: +; CHECK-NEXT: dup.4s v1, w0 +; CHECK-NEXT: sqdmull2.2d v0, v0, v1 +; CHECK-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -1903,8 +2382,10 @@ define <2 x i64> @mull_from_extract_dup_high(<4 x i32> %lhs, i32 %rhs) { define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { ; CHECK-LABEL: pmull_from_extract_dup_low: -; CHECK-NOT: ext -; CHECK: pmull.8h +; CHECK: // %bb.0: +; CHECK-NEXT: dup.8b v1, w0 +; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 %rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> @@ -1916,8 +2397,10 @@ define <8 x i16> @pmull_from_extract_dup_low(<16 x i8> %lhs, i8 %rhs) { define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { ; CHECK-LABEL: pmull_from_extract_dup_high: -; CHECK-NOT: ext -; CHECK: pmull2.8h +; CHECK: // %bb.0: +; CHECK-NEXT: dup.16b v1, w0 +; CHECK-NEXT: pmull2.8h v0, v0, v1 +; CHECK-NEXT: ret %rhsvec.0 = insertelement <8 x i8> undef, i8 %rhs, i32 0 
%rhsvec = shufflevector <8 x i8> %rhsvec.0, <8 x i8> undef, <8 x i32> @@ -1929,9 +2412,11 @@ define <8 x i16> @pmull_from_extract_dup_high(<16 x i8> %lhs, i8 %rhs) { define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) { ; CHECK-LABEL: pmull_from_extract_duplane_low: -; CHECK-NOT: ext -; CHECK: pmull.8h - +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: dup.8b v1, v1[0] +; CHECK-NEXT: pmull.8h v0, v0, v1 +; CHECK-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> @@ -1941,9 +2426,11 @@ define <8 x i16> @pmull_from_extract_duplane_low(<16 x i8> %lhs, <8 x i8> %rhs) define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) { ; CHECK-LABEL: pmull_from_extract_duplane_high: -; CHECK-NOT: ext -; CHECK: pmull2.8h - +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: dup.16b v1, v1[0] +; CHECK-NEXT: pmull2.8h v0, v0, v1 +; CHECK-NEXT: ret %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> @@ -1953,9 +2440,9 @@ define <8 x i16> @pmull_from_extract_duplane_high(<16 x i8> %lhs, <8 x i8> %rhs) define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmull_from_extract_duplane_low: -; CHECK-NOT: ext -; CHECK: sqdmull.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmull.2d v0, v0, v1[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -1965,9 +2452,9 @@ define <2 x i64> @sqdmull_from_extract_duplane_low(<4 x i32> %lhs, <4 x i32> %rh define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmull_from_extract_duplane_high: -; CHECK-NOT: ext -; CHECK: sqdmull2.2d - +; CHECK: // %bb.0: +; 
CHECK-NEXT: sqdmull2.2d v0, v0, v1[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -1977,9 +2464,9 @@ define <2 x i64> @sqdmull_from_extract_duplane_high(<4 x i32> %lhs, <4 x i32> %r define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmlal_from_extract_duplane_low: -; CHECK-NOT: ext -; CHECK: sqdmlal.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmlal.2d v0, v1, v2[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -1990,9 +2477,9 @@ define <2 x i64> @sqdmlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> % define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: sqdmlal_from_extract_duplane_high: -; CHECK-NOT: ext -; CHECK: sqdmlal2.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmlal2.2d v0, v1, v2[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -2003,9 +2490,9 @@ define <2 x i64> @sqdmlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: umlal_from_extract_duplane_low: -; CHECK-NOT: ext -; CHECK: umlal.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: umlal.2d v0, v1, v2[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -2016,9 +2503,9 @@ define <2 x i64> @umlal_from_extract_duplane_low(<2 x i64> %accum, <4 x i32> %lh define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: umlal_from_extract_duplane_high: -; CHECK-NOT: ext 
-; CHECK: umlal2.2d - +; CHECK: // %bb.0: +; CHECK-NEXT: umlal2.2d v0, v1, v2[0] +; CHECK-NEXT: ret %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> @@ -2029,7 +2516,9 @@ define <2 x i64> @umlal_from_extract_duplane_high(<2 x i64> %accum, <4 x i32> %l define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { ; CHECK-LABEL: scalar_fmla_from_extract_v4f32: -; CHECK: fmla.s s0, s1, v2[3] +; CHECK: // %bb.0: +; CHECK-NEXT: fmla.s s0, s1, v2[3] +; CHECK-NEXT: ret %rhs = extractelement <4 x float> %rvec, i32 3 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) ret float %res @@ -2037,7 +2526,10 @@ define float @scalar_fmla_from_extract_v4f32(float %accum, float %lhs, <4 x floa define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { ; CHECK-LABEL: scalar_fmla_from_extract_v2f32: -; CHECK: fmla.s s0, s1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla.s s0, s1, v2[1] +; CHECK-NEXT: ret %rhs = extractelement <2 x float> %rvec, i32 1 %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) ret float %res @@ -2045,7 +2537,9 @@ define float @scalar_fmla_from_extract_v2f32(float %accum, float %lhs, <2 x floa define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x float> %rvec) { ; CHECK-LABEL: scalar_fmls_from_extract_v4f32: -; CHECK: fmls.s s0, s1, v2[3] +; CHECK: // %bb.0: +; CHECK-NEXT: fmls.s s0, s1, v2[3] +; CHECK-NEXT: ret %rhs.scal = extractelement <4 x float> %rvec, i32 3 %rhs = fsub float -0.0, %rhs.scal %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) @@ -2054,7 +2548,10 @@ define float @scalar_fmls_from_extract_v4f32(float %accum, float %lhs, <4 x floa define float @scalar_fmls_from_extract_v2f32(float %accum, float %lhs, <2 x float> %rvec) { ; CHECK-LABEL: scalar_fmls_from_extract_v2f32: -; 
CHECK: fmls.s s0, s1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls.s s0, s1, v2[1] +; CHECK-NEXT: ret %rhs.scal = extractelement <2 x float> %rvec, i32 1 %rhs = fsub float -0.0, %rhs.scal %res = call float @llvm.fma.f32(float %lhs, float %rhs, float %accum) @@ -2065,7 +2562,9 @@ declare float @llvm.fma.f32(float, float, float) define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { ; CHECK-LABEL: scalar_fmla_from_extract_v2f64: -; CHECK: fmla.d d0, d1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: fmla.d d0, d1, v2[1] +; CHECK-NEXT: ret %rhs = extractelement <2 x double> %rvec, i32 1 %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) ret double %res @@ -2073,7 +2572,9 @@ define double @scalar_fmla_from_extract_v2f64(double %accum, double %lhs, <2 x d define double @scalar_fmls_from_extract_v2f64(double %accum, double %lhs, <2 x double> %rvec) { ; CHECK-LABEL: scalar_fmls_from_extract_v2f64: -; CHECK: fmls.d d0, d1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: fmls.d d0, d1, v2[1] +; CHECK-NEXT: ret %rhs.scal = extractelement <2 x double> %rvec, i32 1 %rhs = fsub double -0.0, %rhs.scal %res = call double @llvm.fma.f64(double %lhs, double %rhs, double %accum) @@ -2084,7 +2585,9 @@ declare double @llvm.fma.f64(double, double, double) define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x float> %lhs, <4 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32: -; CHECK: fmls.2s v0, v1, v2[3] +; CHECK: // %bb.0: +; CHECK-NEXT: fmls.2s v0, v1, v2[3] +; CHECK-NEXT: ret %rhs_neg = fsub <4 x float> , %rhs %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <2 x i32> %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) @@ -2093,7 +2596,10 @@ define <2 x float> @fmls_with_fneg_before_extract_v2f32(<2 x float> %accum, <2 x define <2 x float> 
@fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v2f32_1: -; CHECK: fmls.2s v0, v1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls.2s v0, v1, v2[1] +; CHECK-NEXT: ret %rhs_neg = fsub <2 x float> , %rhs %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <2 x i32> %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %splat, <2 x float> %accum) @@ -2102,7 +2608,9 @@ define <2 x float> @fmls_with_fneg_before_extract_v2f32_1(<2 x float> %accum, <2 define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32: -; CHECK: fmls.4s v0, v1, v2[3] +; CHECK: // %bb.0: +; CHECK-NEXT: fmls.4s v0, v1, v2[3] +; CHECK-NEXT: ret %rhs_neg = fsub <4 x float> , %rhs %splat = shufflevector <4 x float> %rhs_neg, <4 x float> undef, <4 x i32> %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) @@ -2111,7 +2619,10 @@ define <4 x float> @fmls_with_fneg_before_extract_v4f32(<4 x float> %accum, <4 x define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 x float> %lhs, <2 x float> %rhs) { ; CHECK-LABEL: fmls_with_fneg_before_extract_v4f32_1: -; CHECK: fmls.4s v0, v1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls.4s v0, v1, v2[1] +; CHECK-NEXT: ret %rhs_neg = fsub <2 x float> , %rhs %splat = shufflevector <2 x float> %rhs_neg, <2 x float> undef, <4 x i32> %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %splat, <4 x float> %accum) @@ -2120,7 +2631,9 @@ define <4 x float> @fmls_with_fneg_before_extract_v4f32_1(<4 x float> %accum, <4 define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 x double> %lhs, <2 x double> %rhs) { ; CHECK-LABEL: 
fmls_with_fneg_before_extract_v2f64: -; CHECK: fmls.2d v0, v1, v2[1] +; CHECK: // %bb.0: +; CHECK-NEXT: fmls.2d v0, v1, v2[1] +; CHECK-NEXT: ret %rhs_neg = fsub <2 x double> , %rhs %splat = shufflevector <2 x double> %rhs_neg, <2 x double> undef, <2 x i32> %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %lhs, <2 x double> %splat, <2 x double> %accum) @@ -2129,29 +2642,45 @@ define <2 x double> @fmls_with_fneg_before_extract_v2f64(<2 x double> %accum, <2 define <1 x double> @test_fmul_v1f64(<1 x double> %L, <1 x double> %R) nounwind { ; CHECK-LABEL: test_fmul_v1f64: -; CHECK: fmul +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: ret %prod = fmul <1 x double> %L, %R ret <1 x double> %prod } define <1 x double> @test_fdiv_v1f64(<1 x double> %L, <1 x double> %R) nounwind { ; CHECK-LABEL: test_fdiv_v1f64: -; CHECK-LABEL: fdiv +; CHECK: // %bb.0: +; CHECK-NEXT: fdiv d0, d0, d1 +; CHECK-NEXT: ret %prod = fdiv <1 x double> %L, %R ret <1 x double> %prod } define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind { -;CHECK-LABEL: sqdmlal_d: -;CHECK: sqdmlal +; CHECK-LABEL: sqdmlal_d: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x2 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: sqdmlal d0, s1, s2 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4) ret i64 %tmp5 } define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { -;CHECK-LABEL: sqdmlsl_d: -;CHECK: sqdmlsl +; CHECK-LABEL: sqdmlsl_d: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x2 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: sqdmlsl d0, s1, s2 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B) %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4) ret i64 %tmp5 @@ -2159,14 +2688,20 @@ define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind { define <16 x i8> 
@test_pmull_64(i64 %l, i64 %r) nounwind { ; CHECK-LABEL: test_pmull_64: -; CHECK: pmull.1q +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: pmull.1q v0, v0, v1 +; CHECK-NEXT: ret %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r) ret <16 x i8> %val } define <16 x i8> @test_pmull_high_64(<2 x i64> %l, <2 x i64> %r) nounwind { ; CHECK-LABEL: test_pmull_high_64: -; CHECK: pmull2.1q +; CHECK: // %bb.0: +; CHECK-NEXT: pmull2.1q v0, v0, v1 +; CHECK-NEXT: ret %l_hi = extractelement <2 x i64> %l, i32 1 %r_hi = extractelement <2 x i64> %r, i32 1 %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi) @@ -2177,7 +2712,14 @@ declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind { ; CHECK-LABEL: test_mul_v1i64: -; CHECK: mul +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: mul x8, x9, x8 +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret %prod = mul <1 x i64> %lhs, %rhs ret <1 x i64> %prod }