From: John Brawn <john.brawn@arm.com>
Date: Fri, 26 May 2017 13:59:12 +0000 (+0000)
Subject: [ARM] Fix lowering of misaligned memcpy/memset
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bafb2e66d85af00ba60213496cb2e5f1415f514c;p=llvm

[ARM] Fix lowering of misaligned memcpy/memset

Currently getOptimalMemOpType returns i32 for large enough sizes without
checking for alignment, leading to poor code generation when misaligned accesses
aren't permitted as we generate a word store then later split it up into byte
stores. This means we inadvertantly go over the MaxStoresPerMemcpy limit and for
memset we splat the memset value into a word then immediately split it up
again.

Fix this by leaving it up to FindOptimalMemOpLowering to figure out which type
to use, but also fix a bug there where it wasn't correctly checking if
misaligned memory accesses are allowed.

Differential Revision: https://reviews.llvm.org/D33442


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303990 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b26f1ce8c5e..177898e1e95 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4779,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
                                    DAG.getMachineFunction());
 
   if (VT == MVT::Other) {
-    if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||
-        TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {
-      VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);
-    } else {
-      switch (DstAlign & 7) {
-      case 0:  VT = MVT::i64; break;
-      case 4:  VT = MVT::i32; break;
-      case 2:  VT = MVT::i16; break;
-      default: VT = MVT::i8;  break;
-      }
-    }
-
+    // Use the largest integer type whose alignment constraints are satisfied.
+    // We only need to check DstAlign here as SrcAlign is always greater or
+    // equal to DstAlign (or zero).
+    VT = MVT::i64;
+    while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
+           !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
+      VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+    assert(VT.isInteger());
+
+    // Find the largest legal integer type.
     MVT LVT = MVT::i64;
     while (!TLI.isTypeLegal(LVT))
       LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
     assert(LVT.isInteger());
 
+    // If the type we've chosen is larger than the largest legal integer type
+    // then use that instead.
     if (VT.bitsGT(LVT))
       VT = LVT;
   }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 4bfb9b01656..62e774d869d 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
     }
   }
 
-  // Lowering to i32/i16 if the size permits.
-  if (Size >= 4)
-    return MVT::i32;
-  else if (Size >= 2)
-    return MVT::i16;
-
   // Let the target-independent logic figure it out.
   return MVT::Other;
 }
diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll
index 436e49b9f39..b447497b270 100644
--- a/test/CodeGen/ARM/memcpy-inline.ll
+++ b/test/CodeGen/ARM/memcpy-inline.ll
@@ -95,10 +95,7 @@ entry:
 ; CHECK: movt [[REG7:r[0-9]+]], #22866
 ; CHECK: str [[REG7]]
 ; CHECK-T1-LABEL: t5:
-; CHECK-T1: movs [[TREG3:r[0-9]]],
-; CHECK-T1: strb [[TREG3]],
-; CHECK-T1: movs [[TREG4:r[0-9]]],
-; CHECK-T1: strb [[TREG4]],
+; CHECK-T1: bl _memcpy
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
   ret void
 }
diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll
index e1d28b98f19..b2bd257701d 100644
--- a/test/CodeGen/ARM/memset-inline.ll
+++ b/test/CodeGen/ARM/memset-inline.ll
@@ -38,6 +38,56 @@ entry:
   ret void
 }
 
+define void @t3(i8* %p) {
+entry:
+; CHECK-7A-LABEL: t3:
+; CHECK-7A: muls [[REG:r[0-9]+]],
+; CHECK-7A: str  [[REG]],
+; CHECK-6M-LABEL: t3:
+; CHECK-6M-NOT: muls
+; CHECK-6M: strb [[REG:r[0-9]+]],
+; CHECK-6M: strb [[REG]],
+; CHECK-6M: strb [[REG]],
+; CHECK-6M: strb [[REG]],
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i to i8
+  call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false)
+  call void @something(i8* %p)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+define void @t4(i8* %p) {
+entry:
+; CHECK-7A-LABEL: t4:
+; CHECK-7A: muls [[REG:r[0-9]+]],
+; CHECK-7A: str  [[REG]],
+; CHECK-6M-LABEL: t4:
+; CHECK-6M: muls [[REG:r[0-9]+]],
+; CHECK-6M: strh [[REG]],
+; CHECK-6M: strh [[REG]],
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = trunc i32 %i to i8
+  call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false)
+  call void @something(i8* %p)
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, 255
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 declare void @something(i8*) nounwind
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind