From: John Brawn Date: Fri, 26 May 2017 13:59:12 +0000 (+0000) Subject: [ARM] Fix lowering of misaligned memcpy/memset X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bafb2e66d85af00ba60213496cb2e5f1415f514c;p=llvm [ARM] Fix lowering of misaligned memcpy/memset Currently getOptimalMemOpType returns i32 for large enough sizes without checking for alignment, leading to poor code generation when misaligned accesses aren't permitted as we generate a word store then later split it up into byte stores. This means we inadvertantly go over the MaxStoresPerMemcpy limit and for memset we splat the memset value into a word then immediately split it up again. Fix this by leaving it up to FindOptimalMemOpLowering to figure out which type to use, but also fix a bug there where it wasn't correctly checking if misaligned memory accesses are allowed. Differential Revision: https://reviews.llvm.org/D33442 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303990 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index b26f1ce8c5e..177898e1e95 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4779,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, DAG.getMachineFunction()); if (VT == MVT::Other) { - if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) || - TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) { - VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS); - } else { - switch (DstAlign & 7) { - case 0: VT = MVT::i64; break; - case 4: VT = MVT::i32; break; - case 2: VT = MVT::i16; break; - default: VT = MVT::i8; break; - } - } - + // Use the largest integer type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + VT = MVT::i64; + while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && + !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) + VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + assert(VT.isInteger()); + + // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); + // If the type we've chosen is larger than the largest legal integer type + // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 4bfb9b01656..62e774d869d 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, } } - // Lowering to i32/i16 if the size permits. - if (Size >= 4) - return MVT::i32; - else if (Size >= 2) - return MVT::i16; - // Let the target-independent logic figure it out. return MVT::Other; } diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index 436e49b9f39..b447497b270 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -95,10 +95,7 @@ entry: ; CHECK: movt [[REG7:r[0-9]+]], #22866 ; CHECK: str [[REG7]] ; CHECK-T1-LABEL: t5: -; CHECK-T1: movs [[TREG3:r[0-9]]], -; CHECK-T1: strb [[TREG3]], -; CHECK-T1: movs [[TREG4:r[0-9]]], -; CHECK-T1: strb [[TREG4]], +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) ret void } diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index e1d28b98f19..b2bd257701d 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -38,6 +38,56 @@ entry: ret void } +define void @t3(i8* %p) { +entry: +; CHECK-7A-LABEL: t3: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t3: +; CHECK-6M-NOT: muls +; CHECK-6M: strb [[REG:r[0-9]+]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @t4(i8* %p) { +entry: +; CHECK-7A-LABEL: t4: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t4: +; CHECK-6M: muls [[REG:r[0-9]+]], +; CHECK-6M: strh [[REG]], +; CHECK-6M: strh [[REG]], + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + declare void @something(i8*) nounwind declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind