From: George Burgess IV
Date: Fri, 9 Jun 2017 03:56:15 +0000 (+0000)
Subject: [LoopVectorize] Don't preserve nsw/nuw flags on shrunken ops.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9276050d307c92ba88256975fe16a1ef6ca3f0c8;p=llvm

[LoopVectorize] Don't preserve nsw/nuw flags on shrunken ops.

If we're shrinking a binary operation, it may be the case that the new
operation wraps where the old one didn't. If that happens, the wrapping
must be well-defined behavior, so we can't always carry the wrapping
flags with us when we shrink operations.

If we do, we get incorrect optimizations in cases like:

void foo(const unsigned char *from, unsigned char *to, int n) {
  for (int i = 0; i < n; i++)
    to[i] = from[i] - 128;
}

which gets optimized to:

void foo(const unsigned char *from, unsigned char *to, int n) {
  for (int i = 0; i < n; i++)
    to[i] = from[i] | 128;
}

Because:
- InstCombine turned `sub i32 %from.i, 128` into
  `add nuw nsw i32 %from.i, 128`.
- LoopVectorize vectorized the add as `add nuw nsw <16 x i8>` with a
  vector full of `i8 128`s.
- InstCombine took advantage of the fact that the newly-shrunken add
  "couldn't wrap", and changed the `add` to an `or`.

InstCombine seems happy to figure out whether we can add nuw/nsw on its
own, so I just decided to drop the flags. There are already a number of
places in LoopVectorize where we rely on InstCombine to clean up.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305053 91177308-0d34-0410-b5e6-96231b3b80d8
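
For a concrete sense of the divergence, a minimal standalone sketch follows
(illustrative only; the input byte 200 is chosen arbitrarily and is not taken
from the patch or its tests). Subtraction and bitwise-or agree only while
from[i] has its top bit clear:

#include <stdio.h>

int main(void) {
  /* Illustrative values only. */
  unsigned char from = 200;                        /* top bit already set */
  unsigned char sub = (unsigned char)(from - 128); /* 200 - 128 = 72      */
  unsigned char orr = (unsigned char)(from | 128); /* 200 | 128 = 200     */
  printf("sub: %d, or: %d\n", sub, orr);           /* "sub: 72, or: 200"  */
  return 0;
}

The nuw/nsw on the shrunken i8 add claimed that from[i] + 128 cannot wrap,
i.e. that the top bit of from[i] is clear, and that claim is exactly what let
InstCombine rewrite the add as an or.
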
---
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index 41aafd92295..00c431834e3 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -360,9 +360,9 @@ public:
   /// Copy I's fast-math flags
   void copyFastMathFlags(const Instruction *I);
 
-  /// Convenience method to copy supported wrapping, exact, and fast-math flags
-  /// from V to this instruction.
-  void copyIRFlags(const Value *V);
+  /// Convenience method to copy supported exact, fast-math, and (optionally)
+  /// wrapping flags from V to this instruction.
+  void copyIRFlags(const Value *V, bool IncludeWrapFlags = true);
 
   /// Logical 'and' of any supported wrapping, exact, and fast-math flags of
   /// V and this instruction.
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 000661073b1..3dd653d2d04 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -216,10 +216,10 @@ void Instruction::copyFastMathFlags(const Instruction *I) {
   copyFastMathFlags(I->getFastMathFlags());
 }
 
-void Instruction::copyIRFlags(const Value *V) {
+void Instruction::copyIRFlags(const Value *V, bool IncludeWrapFlags) {
   // Copy the wrapping flags.
-  if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
-    if (isa<OverflowingBinaryOperator>(this)) {
+  if (IncludeWrapFlags && isa<OverflowingBinaryOperator>(this)) {
+    if (auto *OB = dyn_cast<OverflowingBinaryOperator>(V)) {
       setHasNoSignedWrap(OB->hasNoSignedWrap());
       setHasNoUnsignedWrap(OB->hasNoUnsignedWrap());
     }
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4040be10a14..1abdb248485 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3814,7 +3814,11 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                              ShrinkOperand(BO->getOperand(1)));
-        cast<BinaryOperator>(NewI)->copyIRFlags(I);
+
+        // Any wrapping introduced by shrinking this operation shouldn't be
+        // considered undefined behavior. So, we can't unconditionally copy
+        // arithmetic wrapping flags to NewI.
+        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
         NewI = B.CreateICmp(CI->getPredicate(),
                             ShrinkOperand(CI->getOperand(0)),
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index d06e3fdba39..1149afe7b9f 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -5,7 +5,7 @@ target triple = "aarch64"
 
 ; CHECK-LABEL: @add_a(
 ; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i8>
+; CHECK: add <16 x i8>
 ; CHECK: store <16 x i8>
 ; Function Attrs: nounwind
 define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
@@ -31,9 +31,37 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 }
 
+; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
+; working with.
+; CHECK-LABEL: @add_a1(
+; CHECK: load <16 x i8>, <16 x i8>*
+; CHECK: add nuw nsw <16 x i8>
+; CHECK: store <16 x i8>
+; Function Attrs: nounwind
+define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
+entry:
+  %cmp8 = icmp sgt i32 %len, 0
+  br i1 %cmp8, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx
+  %add = add nuw nsw i8 %0, 2
+  %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx3
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %len
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 ; CHECK-LABEL: @add_b(
 ; CHECK: load <8 x i16>, <8 x i16>*
-; CHECK: add nuw nsw <8 x i16>
+; CHECK: add <8 x i16>
 ; CHECK: store <8 x i16>
 ; Function Attrs: nounwind
 define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
@@ -61,7 +89,7 @@ for.body:                                         ; preds = %entry, %for.body
 
 ; CHECK-LABEL: @add_c(
 ; CHECK: load <8 x i8>, <8 x i8>*
-; CHECK: add nuw nsw <8 x i16>
+; CHECK: add <8 x i16>
 ; CHECK: store <8 x i16>
 ; Function Attrs: nounwind
 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
@@ -116,12 +144,12 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK-LABEL: @add_e(
 ; CHECK: load <16 x i8>
 ; CHECK: shl <16 x i8>
-; CHECK: add nuw nsw <16 x i8>
+; CHECK: add <16 x i8>
 ; CHECK: or <16 x i8>
-; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul <16 x i8>
 ; CHECK: and <16 x i8>
 ; CHECK: xor <16 x i8>
-; CHECK: mul nuw nsw <16 x i8>
+; CHECK: mul <16 x i8>
 ; CHECK: store <16 x i8>
 define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry:
@@ -162,12 +190,12 @@ for.body:                                         ; preds = %for.body, %for.body
 ; CHECK: load <8 x i16>
 ; CHECK: trunc <8 x i16>
 ; CHECK: shl <8 x i8>
-; CHECK: add nsw <8 x i8>
+; CHECK: add <8 x i8>
 ; CHECK: or <8 x i8>
-; CHECK: mul nuw nsw <8 x i8>
+; CHECK: mul <8 x i8>
 ; CHECK: and <8 x i8>
 ; CHECK: xor <8 x i8>
-; CHECK: mul nuw nsw <8 x i8>
+; CHECK: mul <8 x i8>
 ; CHECK: store <8 x i8>
 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 entry: