From 7e7d3003e4d30d3fb841c0bd19da9d8d2f47f56c Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Mon, 23 Sep 2019 09:38:53 +0000
Subject: [PATCH] [AArch64] Support neon_sshl and neon_ushl in performIntrinsicCombine.

Try to generate ushll/sshll for aarch64_neon_ushl/aarch64_neon_sshl,
if their first operand is extended and the second operand is a constant.

Also adds a few tests marked with FIXME, where we can further improve
codegen.

Reviewers: t.p.northover, samparker, dmgreen, anemet

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D62308

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@372565 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp |  25 +++
 test/CodeGen/AArch64/arm64-vshift.ll       | 190 ++++++++++++++++++++-
 2 files changed, 209 insertions(+), 6 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5048371282b..fa64ef67202 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10332,6 +10332,29 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
     Opcode = AArch64ISD::SQSHLU_I;
     IsRightShift = false;
     break;
+  case Intrinsic::aarch64_neon_sshl:
+  case Intrinsic::aarch64_neon_ushl: {
+    // ushll/ushll2 provide unsigned shifts with immediate operands and
+    // sshll/sshll2 provide signed shifts with immediates, so we have to make
+    // sure we only match patterns here that we can later match to them.
+    SDValue Op0 = N->getOperand(1);
+    if (Op0.getNode()->getOpcode() != (IID == Intrinsic::aarch64_neon_ushl
+                                           ? ISD::ZERO_EXTEND
+                                           : ISD::SIGN_EXTEND))
+      return SDValue();
+
+    EVT FromType = Op0.getOperand(0).getValueType();
+    EVT ToType = Op0.getValueType();
+    unsigned FromSize = FromType.getScalarSizeInBits();
+    if (!FromType.isVector() || !ToType.isVector() ||
+        (FromSize != 8 && FromSize != 16 && FromSize != 32) ||
+        2 * FromSize != ToType.getScalarSizeInBits())
+      return SDValue();
+
+    Opcode = AArch64ISD::VSHL;
+    IsRightShift = false;
+    break;
+  }
   }
 
   if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
@@ -10418,6 +10441,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_sqshlu:
   case Intrinsic::aarch64_neon_srshl:
   case Intrinsic::aarch64_neon_urshl:
+  case Intrinsic::aarch64_neon_sshl:
+  case Intrinsic::aarch64_neon_ushl:
     return tryCombineShiftImm(IID, N, DAG);
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll
index 6b0fe40b5a0..8d4ef1d5cbe 100644
--- a/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1192,6 +1192,100 @@ define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind {
   ret <2 x i64> %tmp3
 }
 
+declare <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64>, <2 x i64>)
+
+define <8 x i16> @neon.ushll8h_constant_shift(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.ushll8h_constant_shift
+;CHECK: ushll.8h v0, {{v[0-9]+}}, #1
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @neon.ushl8h_no_constant_shift(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.ushl8h_no_constant_shift
+;CHECK: ushl.8h v0, v0, v0
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = zext <8 x i8> %tmp1 to <8 x i16>
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp2, <8 x i16> %tmp2)
+  ret <8 x i16> %tmp3
+}
+
+; Here we do not extend to double the bitwidth, so we cannot fold to ushll.
+define <4 x i32> @neon.ushll8h_constant_shift_extend_not_2x(<4 x i8>* %A) nounwind {
+;CHECK-LABEL: @neon.ushll8h_constant_shift_extend_not_2x
+;CHECK-NOT: ushll.8h v0,
+;CHECK: ldrb w8, [x0]
+;CHECK: movi.4s v1, #1
+;CHECK: fmov s0, w8
+;CHECK: ldrb w8, [x0, #1]
+;CHECK: mov.s v0[1], w8
+;CHECK: ldrb w8, [x0, #2]
+;CHECK: mov.s v0[2], w8
+;CHECK: ldrb w8, [x0, #3]
+;CHECK: mov.s v0[3], w8
+;CHECK: ushl.4s v0, v0, v1
+  %tmp1 = load <4 x i8>, <4 x i8>* %A
+  %tmp2 = zext <4 x i8> %tmp1 to <4 x i32>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  ret <4 x i32> %tmp3
+}
+
+define <8 x i16> @neon.ushl8_noext_constant_shift(<8 x i16>* %A) nounwind {
+; CHECK-LABEL: neon.ushl8_noext_constant_shift
+; CHECK: ldr q0, [x0]
+; CHECK-NEXT: movi.8h v1, #1
+; CHECK-NEXT: ushl.8h v0, v0, v1
+; CHECK-NEXT: ret
+  %tmp1 = load <8 x i16>, <8 x i16>* %A
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+  ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @neon.ushll4s_constant_shift(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: neon.ushll4s_constant_shift
+;CHECK: ushll.4s v0, {{v[0-9]+}}, #1
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  ret <4 x i32> %tmp3
+}
+
+; FIXME: unnecessary ushll.4s v0, v0, #0?
+define <4 x i32> @neon.ushll4s_neg_constant_shift(<4 x i16>* %A) nounwind {
+; CHECK-LABEL: neon.ushll4s_neg_constant_shift
+; CHECK: movi.2d v1, #0xffffffffffffffff
+; CHECK: ushll.4s v0, v0, #0
+; CHECK: ushl.4s v0, v0, v1
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = zext <4 x i16> %tmp1 to <4 x i32>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  ret <4 x i32> %tmp3
+}
+
+; FIXME: should be constant folded.
+define <4 x i32> @neon.ushll4s_constant_fold() nounwind {
+; CHECK-LABEL: neon.ushll4s_constant_fold
+; CHECK: movi.4s v1, #1
+; CHECK-NEXT: ushl.4s v0, v0, v1
+;
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @neon.ushll2d_constant_shift(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: neon.ushll2d_constant_shift
+;CHECK: ushll.2d v0, {{v[0-9]+}}, #1
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = zext <2 x i32> %tmp1 to <2 x i64>
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 1, i64 1>)
+  ret <2 x i64> %tmp3
+}
+
 define <8 x i16> @sshll8h(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: sshll8h:
 ;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
@@ -1201,21 +1295,105 @@ define <8 x i16> @sshll8h(<8 x i8>* %A) nounwind {
   ret <8 x i16> %tmp3
 }
 
-define <4 x i32> @sshll4s(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: sshll4s:
+define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: sshll2d:
+;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
+  %tmp1 = load <2 x i32>, <2 x i32>* %A
+  %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
+  %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+  ret <2 x i64> %tmp3
+}
+
+declare <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64>, <2 x i64>)
+
+define <16 x i8> @neon.sshl16b_constant_shift(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.sshl16b_constant_shift
+;CHECK: sshl.16b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp1 = load <16 x i8>, <16 x i8>* %A
+  %tmp2 = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i16> @neon.sshll8h_constant_shift(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.sshll8h_constant_shift
+;CHECK: sshll.8h v0, {{v[0-9]+}}, #1
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = sext <8 x i8> %tmp1 to <8 x i16>
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %tmp2, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+  ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(<4 x i8>* %A) nounwind {
+;CHECK-LABEL: neon.sshl4s_wrong_ext_constant_shift
+;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp1 = load <4 x i8>, <4 x i8>* %A
+  %tmp2 = sext <4 x i8> %tmp1 to <4 x i32>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  ret <4 x i32> %tmp3
+}
+
+
+define <4 x i32> @neon.sshll4s_constant_shift(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: neon.sshll4s_constant_shift
 ;CHECK: sshll.4s v0, {{v[0-9]+}}, #1
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
-  %tmp3 = shl <4 x i32> %tmp2, <i32 1, i32 1, i32 1, i32 1>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
   ret <4 x i32> %tmp3
 }
 
-define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: sshll2d:
+define <4 x i32> @neon.sshll4s_neg_constant_shift(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: neon.sshll4s_neg_constant_shift
+;CHECK: movi.2d v1, #0xffffffffffffffff
+;CHECK: sshll.4s v0, v0, #0
+;CHECK: sshl.4s v0, v0, v1
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = sext <4 x i16> %tmp1 to <4 x i32>
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp2, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+  ret <4 x i32> %tmp3
+}
+
+; FIXME: should be constant folded.
+define <4 x i32> @neon.sshl4s_constant_fold() nounwind {
+;CHECK-LABEL: neon.sshl4s_constant_fold
+;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> )
+  ret <4 x i32> %tmp3
+}
+
+define <4 x i32> @neon.sshl4s_no_fold(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: neon.sshl4s_no_fold
+;CHECK: sshl.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp1 = load <4 x i32>, <4 x i32>* %A
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+  ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @neon.sshll2d_constant_shift(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: neon.sshll2d_constant_shift
 ;CHECK: sshll.2d v0, {{v[0-9]+}}, #1
   %tmp1 = load <2 x i32>, <2 x i32>* %A
   %tmp2 = sext <2 x i32> %tmp1 to <2 x i64>
-  %tmp3 = shl <2 x i64> %tmp2, <i64 1, i64 1>
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 1, i64 1>)
+  ret <2 x i64> %tmp3
+}
+
+; FIXME: should be constant folded.
+define <2 x i64> @neon.sshl2d_constant_fold() nounwind {
+;CHECK-LABEL: neon.sshl2d_constant_fold
+;CHECK: sshl.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> )
+  ret <2 x i64> %tmp3
+}
+
+define <2 x i64> @neon.sshl2d_no_fold(<2 x i64>* %A) nounwind {
+;CHECK-LABEL: neon.sshl2d_no_fold
+;CHECK: sshl.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+  %tmp2 = load <2 x i64>, <2 x i64>* %A
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %tmp2, <2 x i64> <i64 1, i64 1>)
   ret <2 x i64> %tmp3
 }
-- 
2.40.0
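
For illustration only, not part of the patch above: a minimal C-level sketch of the source pattern this combine is meant to improve. The function name widen_shl1_u8 is made up for the example, and it assumes clang lowers vmovl_u8 to a zext and vshlq_u16 to llvm.aarch64.neon.ushl with a constant splat shift, which is the zext-plus-constant-ushl shape tryCombineShiftImm now rewrites. With that assumption, the pair is expected to compile to a single ushll.8h with immediate #1 instead of a separate extend followed by a vector ushl.

  #include <arm_neon.h>

  // Widen each unsigned 8-bit lane to 16 bits and shift left by a constant 1.
  // vmovl_u8 zero-extends the lanes and vshlq_u16 takes a vector shift amount,
  // here a splat of 1, matching the pattern the combine targets.
  uint16x8_t widen_shl1_u8(uint8x8_t v) {
    return vshlq_u16(vmovl_u8(v), vdupq_n_s16(1));
  }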