From 6f50c7fe4aa8c7792be0258595f2769c5a70e540 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha
Date: Tue, 4 Apr 2017 22:55:53 +0000
Subject: [PATCH] [AArch64] Avoid partial register deps on insertelt of load
 into lane 0.

This improves upon r246462: that prevented FMOVs from being emitted
for the cross-class INSERT_SUBREGs by disabling the formation of
INSERT_SUBREGs of LOAD.  But the ld1.s that we started selecting
caused us to introduce partial dependencies on the vector register.

Avoid that by using SCALAR_TO_VECTOR: it's a first-class citizen that
is folded away by many patterns, including the scalar LDRS that we
want in this case.

Credit goes to Adam for finding the issue!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@299482 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp        | 16 +++++-----------
 .../CodeGen/AArch64/arm64-indexed-vector-ldst.ll  | 10 +++++-----
 2 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d4a9943ecc..ea184d55e44 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6587,19 +6587,13 @@ FailedModImm:
     SDValue Op0 = Op.getOperand(0);
     unsigned ElemSize = VT.getScalarSizeInBits();
     unsigned i = 0;
-    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+    // For 32 and 64 bit types, use SCALAR_TO_VECTOR for lane zero to
     // a) Avoid a RMW dependency on the full vector register, and
     // b) Allow the register coalescer to fold away the copy if the
-    //    value is already in an S or D register.
-    // Do not do this for UNDEF/LOAD nodes because we have better patterns
-    // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
-    if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
-        (ElemSize == 32 || ElemSize == 64)) {
-      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
-      MachineSDNode *N =
-          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
-                             DAG.getTargetConstant(SubIdx, dl, MVT::i32));
-      Vec = SDValue(N, 0);
+    //    value is already in an S or D register, and we're forced to emit an
+    //    INSERT_SUBREG that we can't fold anywhere.
+    if (!Op0.isUndef() && (ElemSize == 32 || ElemSize == 64)) {
+      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
       ++i;
     }
     for (; i < NumElts; ++i) {
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 071b2d0dbca..d344084ef62 100644
--- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -6216,11 +6216,11 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
 
 ; CHECK-LABEL: test_ld1lane_build:
-; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0]
-; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1]
-; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2]
-; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3]
-; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]]
+; CHECK-DAG: ldr s[[REGNUM0:[0-9]+]], [x0]
+; CHECK-DAG: ld1.s { v[[REGNUM0:[0-9]+]] }[1], [x1]
+; CHECK-DAG: ldr s[[REGNUM1:[0-9]+]], [x2]
+; CHECK-DAG: ld1.s { v[[REGNUM1:[0-9]+]] }[1], [x3]
+; CHECK: sub.2s v[[REGNUM2:[0-9]+]], v[[REGNUM0]], v[[REGNUM1]]
 ; CHECK-NEXT: str d[[REGNUM2]], [x4]
 ; CHECK-NEXT: ret
 define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) {
-- 
2.40.0
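
For context, a minimal IR sketch of the "insertelt of load into lane 0" pattern this change affects; it is not part of the patch, the function name is hypothetical, and it is simply a reduced form of test_ld1lane_build above. Before this change, lane 0 could be selected as an ld1.s into an existing vector register, which reads and partially rewrites that register; by going through SCALAR_TO_VECTOR, lane 0 is expected to fold into a plain scalar ldr, which defines the whole register and carries no such dependency.

; Illustrative reduced example (hypothetical, not from the test suite),
; modeled on test_ld1lane_build: lane 0 of the vector comes straight
; from a load, lane 1 is inserted afterwards.
define <2 x i32> @example_build_from_loads(i32* %p0, i32* %p1) {
  %lane0 = load i32, i32* %p0
  %lane1 = load i32, i32* %p1
  %v0 = insertelement <2 x i32> undef, i32 %lane0, i32 0
  %v1 = insertelement <2 x i32> %v0, i32 %lane1, i32 1
  ret <2 x i32> %v1
}

With this change, lane 0 is expected to lower to "ldr s<N>, [x0]" and lane 1 to "ld1.s { v<N> }[1], [x1]", mirroring the updated CHECK lines in the test hunk above.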