[IR] Match intrinsic parameter by scalar/vectorwidth

author Simon Pilgrim <llvm-dev@redking.me.uk>

Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)

committer Simon Pilgrim <llvm-dev@redking.me.uk>

Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)
author Simon Pilgrim <llvm-dev@redking.me.uk>
Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)
committer Simon Pilgrim <llvm-dev@redking.me.uk>
Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td

index 6123a782c3120838b10c480d94c00ab4f39854b5..5f256275b87a133be3786786b11354d24faaff38 100644 (file)
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -156,10 +156,15 @@ class LLVMMatchType<int num>
  // the intrinsic is overloaded, so the matched type should be declared as iAny.
  class LLVMExtendedType<int num> : LLVMMatchType<num>;
  class LLVMTruncatedType<int num> : LLVMMatchType<num>;
-class LLVMVectorSameWidth<int num, LLVMType elty>
-  : LLVMMatchType<num> {
+
+// Match the scalar/vector of another intrinsic parameter but with a different
+// element type. Either both are scalars or both are vectors with the same
+// number of elements.
+class LLVMScalarOrSameVectorWidth<int idx, LLVMType elty>
+  : LLVMMatchType<idx> {
    ValueType ElTy = elty.VT;
  }
+
  class LLVMPointerTo<int num> : LLVMMatchType<num>;
  class LLVMPointerToElt<int num> : LLVMMatchType<num>;
  class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;
@@ -796,24 +801,30 @@ def int_adjust_trampoline : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty],
  //
  
  // Expose the carry flag from add operations on two integrals.
-def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_sadd_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
-def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_uadd_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
  
-def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_ssub_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
-def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_usub_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
  
-def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_smul_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
-def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
+def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty,
+                                        LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                         [LLVMMatchType<0>, LLVMMatchType<0>],
                                         [IntrNoMem, IntrSpeculatable]>;
  
@@ -1001,35 +1012,35 @@ def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem], "llvm.
  def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
                                        LLVMAnyPointerType<LLVMMatchType<0>>,
                                        llvm_i32_ty,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                   [IntrArgMemOnly]>;
  
  def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
                                   [LLVMAnyPointerType<LLVMMatchType<0>>, llvm_i32_ty,
-                                  LLVMVectorSameWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
+                                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>],
                                   [IntrReadMem, IntrArgMemOnly]>;
  
  def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
                                   [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
-                                  LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                    LLVMMatchType<0>],
                                   [IntrReadMem]>;
  
  def int_masked_scatter: Intrinsic<[],
                                    [llvm_anyvector_ty,
                                     LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
-                                   LLVMVectorSameWidth<0, llvm_i1_ty>]>;
+                                   LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>]>;
  
  def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
                                       [LLVMPointerToElt<0>,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>,
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                        LLVMMatchType<0>],
                                       [IntrReadMem]>;
  
  def int_masked_compressstore: Intrinsic<[],
                                       [llvm_anyvector_ty,
                                        LLVMPointerToElt<0>,
-                                      LLVMVectorSameWidth<0, llvm_i1_ty>],
+                                      LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
                                       [IntrArgMemOnly]>;
  
  // Test whether a pointer is associated with a type metadata identifier.
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp

index 8601ec015a58e637c8bfc24163075a34bf60da0d..3ec421fe8813b4ce2b121f7e620b7fa3b2e16ada 100644 (file)
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -948,10 +948,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
    case IITDescriptor::SameVecWidthArgument: {
      Type *EltTy = DecodeFixedType(Infos, Tys, Context);
      Type *Ty = Tys[D.getArgumentNumber()];
-    if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (auto *VTy = dyn_cast<VectorType>(Ty))
        return VectorType::get(EltTy, VTy->getNumElements());
-    }
-    llvm_unreachable("unhandled");
+    return EltTy;
    }
    case IITDescriptor::PtrToArgument: {
      Type *Ty = Tys[D.getArgumentNumber()];
@@ -1135,15 +1134,19 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
      case IITDescriptor::SameVecWidthArgument: {
        if (D.getArgumentNumber() >= ArgTys.size())
          return true;
-      VectorType * ReferenceType =
-        dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
-      VectorType *ThisArgType = dyn_cast<VectorType>(Ty);
-      if (!ThisArgType || !ReferenceType ||
-          (ReferenceType->getVectorNumElements() !=
-           ThisArgType->getVectorNumElements()))
+      auto *ReferenceType = dyn_cast<VectorType>(ArgTys[D.getArgumentNumber()]);
+      auto *ThisArgType = dyn_cast<VectorType>(Ty);
+      // Both must be vectors of the same number of elements or neither.
+      if ((ReferenceType != nullptr) != (ThisArgType != nullptr))
          return true;
-      return matchIntrinsicType(ThisArgType->getVectorElementType(),
-                                Infos, ArgTys);
+      Type *EltTy = Ty;
+      if (ThisArgType) {
+        if (ReferenceType->getVectorNumElements() !=
+            ThisArgType->getVectorNumElements())
+          return true;
+        EltTy = ThisArgType->getVectorElementType();
+      }
+      return matchIntrinsicType(EltTy, Infos, ArgTys);
      }
      case IITDescriptor::PtrToArgument: {
        if (D.getArgumentNumber() >= ArgTys.size())
diff --git a/test/Analysis/CostModel/X86/arith-overflow.ll b/test/Analysis/CostModel/X86/arith-overflow.ll

new file mode 100644 (file)

index 0000000..2b5c9e9
--- /dev/null
+++ b/test/Analysis/CostModel/X86/arith-overflow.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ
+;
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,GLM
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,BTVER2
+
+;
+; sadd.with.overflow
+;
+
+declare {i64, i1}              @llvm.sadd.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.sadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.sadd.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.sadd.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.sadd.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.sadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.sadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.sadd.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.sadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.sadd.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.sadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.sadd.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @sadd(i32 %arg) {
+; CHECK-LABEL: 'sadd'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; uadd.with.overflow
+;
+
+declare {i64, i1}              @llvm.uadd.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.uadd.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.uadd.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.uadd.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.uadd.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.uadd.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.uadd.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.uadd.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.uadd.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.uadd.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.uadd.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.uadd.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @uadd(i32 %arg) {
+; CHECK-LABEL: 'uadd'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; ssub.with.overflow
+;
+
+declare {i64, i1}              @llvm.ssub.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.ssub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.ssub.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.ssub.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.ssub.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.ssub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.ssub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.ssub.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.ssub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.ssub.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.ssub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.ssub.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @ssub(i32 %arg) {
+; CHECK-LABEL: 'ssub'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; usub.with.overflow
+;
+
+declare {i64, i1}              @llvm.usub.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.usub.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.usub.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.usub.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.usub.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.usub.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.usub.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.usub.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.usub.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.usub.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.usub.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.usub.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @usub(i32 %arg) {
+; CHECK-LABEL: 'usub'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; smul.with.overflow
+;
+
+declare {i64, i1}              @llvm.smul.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.smul.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.smul.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.smul.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.smul.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.smul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.smul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.smul.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.smul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.smul.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.smul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.smul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.smul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @smul(i32 %arg) {
+; CHECK-LABEL: 'smul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
+
+;
+; umul.with.overflow
+;
+
+declare {i64, i1}              @llvm.umul.with.overflow.i64(i64, i64)
+declare {<2 x i64>, <2 x i1>}  @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)
+declare {<4 x i64>, <4 x i1>}  @llvm.umul.with.overflow.v4i64(<4 x i64>, <4 x i64>)
+declare {<8 x i64>, <8 x i1>}  @llvm.umul.with.overflow.v8i64(<8 x i64>, <8 x i64>)
+
+declare {i32, i1}               @llvm.umul.with.overflow.i32(i32, i32)
+declare {<4 x i32>, <4 x i1>}   @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
+declare {<8 x i32>, <8 x i1>}   @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
+declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)
+
+declare {i16, i1}               @llvm.umul.with.overflow.i16(i16, i16)
+declare {<8 x i16>,  <8 x i1>}  @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
+declare {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16>, <16 x i16>)
+declare {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16>, <32 x i16>)
+
+declare {i8, i1}                @llvm.umul.with.overflow.i8(i8, i8)
+declare {<16 x i8>, <16 x i1>}  @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
+declare {<32 x i8>, <32 x i1>}  @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
+declare {<64 x i8>, <64 x i1>}  @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
+
+define i32 @umul(i32 %arg) {
+; CHECK-LABEL: 'umul'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
+  %V2I64 = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+  %V4I64 = call {<4 x i64>, <4 x i1>} @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+  %V8I64 = call {<8 x i64>, <8 x i1>} @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+
+  %I32 = call {i32, i1} @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
+  %V4I32  = call {<4 x i32>, <4 x i1>}  @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+  %V8I32  = call {<8 x i32>, <8 x i1>}  @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+  %V16I32 = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+
+  %I16 = call {i16, i1} @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
+  %V8I16  = call {<8 x i16>, <8 x i1>}  @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
+  %V16I16 = call {<16 x i16>, <16 x i1>} @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+  %V32I16 = call {<32 x i16>, <32 x i1>} @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+
+  %I8 = call {i8, i1} @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
+  %V16I8 = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
+  %V32I8 = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+  %V64I8 = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+
+  ret i32 undef
+}
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp

index adca99552b145ef397c0e901f90ddc38a69515af..5dcb5e230ccbd8e57a943b0961dffa284ceaf57d 100644 (file)
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -633,7 +633,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
        // overloaded, all the types can be specified directly.
        assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
                 !TyEl->isSubClassOf("LLVMTruncatedType") &&
-               !TyEl->isSubClassOf("LLVMVectorSameWidth")) ||
+               !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) ||
                VT == MVT::iAny || VT == MVT::vAny) &&
               "Expected iAny or vAny type");
      } else
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp

index cb76f053a229f890890645e994e766ecccf6ecc7..3c0726308a41c63c7a08980dd023955cb5edeba3 100644 (file)
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -269,7 +269,7 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
        Sig.push_back(IIT_TRUNC_ARG);
      else if (R->isSubClassOf("LLVMHalfElementsVectorType"))
        Sig.push_back(IIT_HALF_VEC_ARG);
-    else if (R->isSubClassOf("LLVMVectorSameWidth")) {
+    else if (R->isSubClassOf("LLVMScalarOrSameVectorWidth")) {
        Sig.push_back(IIT_SAME_VEC_WIDTH_ARG);
        Sig.push_back((Number << 3) | ArgCodes[Number]);
        MVT::SimpleValueType VT = getValueType(R->getValueAsDef("ElTy"));
author	Simon Pilgrim <llvm-dev@redking.me.uk>
	Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)
committer	Simon Pilgrim <llvm-dev@redking.me.uk>
	Wed, 23 Jan 2019 16:00:22 +0000 (16:00 +0000)
include/llvm/IR/Intrinsics.td		patch \| blob \| history
lib/IR/Function.cpp		patch \| blob \| history
test/Analysis/CostModel/X86/arith-overflow.ll	[new file with mode: 0644]	patch \| blob
utils/TableGen/CodeGenTarget.cpp		patch \| blob \| history
utils/TableGen/IntrinsicEmitter.cpp		patch \| blob \| history