[SLP] avoid reduction transform on patterns that the backend can load-combine

author Sanjay Patel <spatel@rotateright.com>

Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)
author Sanjay Patel <spatel@rotateright.com>
Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h

index 6da2d7f43bc42d59d0838084d46f2d4dc210496f..67e62f6f29bcb01c1501de1fc37c5a5d6cab7e31 100644 (file)
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -1129,6 +1129,16 @@ private:
    /// Returns -1 if the cost is unknown.
    int getInstructionThroughput(const Instruction *I) const;
  
+  /// Given an input value that is an element of an 'or' reduction, check if the
+  /// reduction is composed of narrower loaded values. Assuming that a
+  /// legal-sized reduction of shifted/zexted loaded values can be load combined
+  /// in the backend, create a relative cost that accounts for the removal of
+  /// the intermediate ops and replacement by a single wide load.
+  /// Returns None when no load-combine pattern is recognized (for example,
+  /// \p Opcode is not an 'or' or \p Args is empty). Otherwise the result is a
+  /// cost delta (removed ops are subtracted), so it can be negative.
+  /// TODO: If load combining is allowed in the IR optimizer, this analysis
+  ///       may not be necessary.
+  Optional<int> getLoadCombineCost(unsigned Opcode,
+                                   ArrayRef<const Value *> Args) const;
+
+
    /// The abstract base class used to type erase specific TTI
    /// implementations.
    class Concept;
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp

index f3d20ce984dbd09a832e4db65079fcf1f8ae1b11..6730aa86a99a15d5812a9a91441d769c331a8e77 100644 (file)
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -571,11 +571,64 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
    return OpInfo;
  }
  
+Optional<int>
+TargetTransformInfo::getLoadCombineCost(unsigned Opcode,
+                                        ArrayRef<const Value *> Args) const {
+  // Only an 'or' with at least one known operand can be analyzed.
+  if (Opcode != Instruction::Or || Args.empty())
+    return llvm::None;
+
+  // Look past the reduction to find a source value. Arbitrarily follow the
+  // path through operand 0 of any 'or'. Also, peek through optional
+  // shift-left-by-constant.
+  // Note: m_Or/m_Shl also match constant expressions, so walk the operands
+  // through the generic User interface instead of assuming that every match
+  // is a BinaryOperator instruction (cast<BinaryOperator> would assert).
+  const Value *ZextLoad = Args.front();
+  while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+         match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+    ZextLoad = cast<User>(ZextLoad)->getOperand(0);
+
+  // Check if the input to the reduction is an extended load.
+  Value *LoadPtr;
+  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+    return llvm::None;
+
+  // Load combining only applies to scalar integers. A vector zext of a vector
+  // load also matches the pattern above, and getIntegerBitWidth() asserts on
+  // non-integer types, so bail out before querying the widths.
+  Type *WideType = ZextLoad->getType();
+  Type *EltType = LoadPtr->getType()->getPointerElementType();
+  if (!WideType->isIntegerTy() || !EltType->isIntegerTy())
+    return llvm::None;
+
+  // Require that the total load bit width is a legal integer type.
+  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+  unsigned WideWidth = WideType->getIntegerBitWidth();
+  unsigned EltWidth = EltType->getIntegerBitWidth();
+  if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
+    return llvm::None;
+
+  // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
+  // removed and replaced by a single wide load.
+  // FIXME: This is not accurate for the larger pattern where we replace
+  //        multiple narrow load sequences with just 1 wide load. We could
+  //        remove the addition of the wide load cost here and expect the caller
+  //        to make an adjustment for that.
+  int Cost = 0;
+  Cost -= getMemoryOpCost(Instruction::Load, EltType, 0, 0);
+  Cost -= getCastInstrCost(Instruction::ZExt, WideType, EltType);
+  Cost -= getArithmeticInstrCost(Instruction::Shl, WideType);
+  Cost -= getArithmeticInstrCost(Instruction::Or, WideType);
+  Cost += getMemoryOpCost(Instruction::Load, WideType, 0, 0);
+  return Cost;
+}
+
+
  int TargetTransformInfo::getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
      OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
      OperandValueProperties Opd2PropInfo,
      ArrayRef<const Value *> Args) const {
+  // Check if we can match this instruction as part of a larger pattern.
+  Optional<int> LoadCombineCost = getLoadCombineCost(Opcode, Args);
+  if (LoadCombineCost)
+    return LoadCombineCost.getValue();
+
+  // Fallback to implementation-specific overrides or base class.
    int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                               Opd1PropInfo, Opd2PropInfo, Args);
    assert(Cost >= 0 && "TTI should not produce negative costs!");
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp

index 99428c6c5dee384ad2ec64204c3bf1798ee0ec96..ad12646bdeee6e1b02bc179031713b99a4aa1c25 100644 (file)
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6499,10 +6499,19 @@ private:
  
      int ScalarReduxCost = 0;
      switch (ReductionData.getKind()) {
-    case RK_Arithmetic:
-      ScalarReduxCost =
-          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+    case RK_Arithmetic: {
+      // Note: Passing in the reduction operands allows the cost model to match
+      //       load combining patterns for this reduction.
+      auto *ReduxInst = cast<Instruction>(ReductionRoot);
+      SmallVector<const Value *, 2> OperandList;
+      for (Value *Operand : ReduxInst->operands())
+        OperandList.push_back(Operand);
+      ScalarReduxCost = TTI->getArithmeticInstrCost(ReductionData.getOpcode(),
+          ScalarTy, TargetTransformInfo::OK_AnyValue,
+          TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
+          TargetTransformInfo::OP_None, OperandList);
        break;
+    }
      case RK_Min:
      case RK_Max:
      case RK_UMin:
diff --git a/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

index e3452e194dbfbf460e5196a4499321092fecd271..c44a8524edfe596b8bf42a989945d40ecc16ab73 100644 (file)
--- a/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
+++ b/test/Transforms/SLPVectorizer/X86/bad-reduction.ll
@@ -15,31 +15,37 @@ define i64 @load_bswap(%v8i8* %p) {
  ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
  ; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
  ; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
  ; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
  ; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
  ; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
  ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
  ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
  ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
  ; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
  ; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
  ; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
  ;
    %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
    %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -97,18 +103,38 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) {
  ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[T0:%.*]] = load i8, i8* [[G0]]
+; CHECK-NEXT:    [[T1:%.*]] = load i8, i8* [[G1]]
+; CHECK-NEXT:    [[T2:%.*]] = load i8, i8* [[G2]]
+; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[G4]]
+; CHECK-NEXT:    [[T5:%.*]] = load i8, i8* [[G5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i8, i8* [[G6]]
+; CHECK-NEXT:    [[T7:%.*]] = load i8, i8* [[G7]]
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[T0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[T1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[T2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[T4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[T5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[T6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[T7]] to i64
+; CHECK-NEXT:    [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
+; CHECK-NEXT:    [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
+; CHECK-NEXT:    [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
+; CHECK-NEXT:    [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT:    [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
+; CHECK-NEXT:    [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
+; CHECK-NEXT:    [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
+; CHECK-NEXT:    [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
+; CHECK-NEXT:    [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
+; CHECK-NEXT:    [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
+; CHECK-NEXT:    [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
+; CHECK-NEXT:    [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
+; CHECK-NEXT:    [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
+; CHECK-NEXT:    [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
+; CHECK-NEXT:    [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
+; CHECK-NEXT:    ret i64 [[OR01234567]]
  ;
    %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
    %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -168,30 +194,36 @@ define i64 @load64le(i8* %arg) {
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
  ; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
  ; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
  ; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
  ; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
  ; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
  ; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
  ; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
  ; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
  ; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
  ; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
  ; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
-; CHECK-NEXT:    [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
-; CHECK-NEXT:    ret i64 [[OP_EXTRA]]
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[Z0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
  ;
    %g1 = getelementptr inbounds i8, i8* %arg, i64 1
    %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -247,18 +279,38 @@ define i64 @load64le_nop_shift(i8* %arg) {
  ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    ret i64 [[TMP5]]
+; CHECK-NEXT:    [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
+; CHECK-NEXT:    [[LD1:%.*]] = load i8, i8* [[G1]], align 1
+; CHECK-NEXT:    [[LD2:%.*]] = load i8, i8* [[G2]], align 1
+; CHECK-NEXT:    [[LD3:%.*]] = load i8, i8* [[G3]], align 1
+; CHECK-NEXT:    [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT:    [[LD5:%.*]] = load i8, i8* [[G5]], align 1
+; CHECK-NEXT:    [[LD6:%.*]] = load i8, i8* [[G6]], align 1
+; CHECK-NEXT:    [[LD7:%.*]] = load i8, i8* [[G7]], align 1
+; CHECK-NEXT:    [[Z0:%.*]] = zext i8 [[LD0]] to i64
+; CHECK-NEXT:    [[Z1:%.*]] = zext i8 [[LD1]] to i64
+; CHECK-NEXT:    [[Z2:%.*]] = zext i8 [[LD2]] to i64
+; CHECK-NEXT:    [[Z3:%.*]] = zext i8 [[LD3]] to i64
+; CHECK-NEXT:    [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT:    [[Z5:%.*]] = zext i8 [[LD5]] to i64
+; CHECK-NEXT:    [[Z6:%.*]] = zext i8 [[LD6]] to i64
+; CHECK-NEXT:    [[Z7:%.*]] = zext i8 [[LD7]] to i64
+; CHECK-NEXT:    [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
+; CHECK-NEXT:    [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
+; CHECK-NEXT:    [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
+; CHECK-NEXT:    [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
+; CHECK-NEXT:    [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT:    [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
+; CHECK-NEXT:    [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
+; CHECK-NEXT:    [[S7:%.*]] = shl nuw i64 [[Z7]], 56
+; CHECK-NEXT:    [[O1:%.*]] = or i64 [[S1]], [[S0]]
+; CHECK-NEXT:    [[O2:%.*]] = or i64 [[O1]], [[S2]]
+; CHECK-NEXT:    [[O3:%.*]] = or i64 [[O2]], [[S3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i64 [[O3]], [[S4]]
+; CHECK-NEXT:    [[O5:%.*]] = or i64 [[O4]], [[S5]]
+; CHECK-NEXT:    [[O6:%.*]] = or i64 [[O5]], [[S6]]
+; CHECK-NEXT:    [[O7:%.*]] = or i64 [[O6]], [[S7]]
+; CHECK-NEXT:    ret i64 [[O7]]
  ;
    %g1 = getelementptr inbounds i8, i8* %arg, i64 1
    %g2 = getelementptr inbounds i8, i8* %arg, i64 2
author	Sanjay Patel <spatel@rotateright.com>
	Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Sat, 5 Oct 2019 18:03:58 +0000 (18:03 +0000)
include/llvm/Analysis/TargetTransformInfo.h		patch \| blob \| history
lib/Analysis/TargetTransformInfo.cpp		patch \| blob \| history
lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
test/Transforms/SLPVectorizer/X86/bad-reduction.ll		patch \| blob \| history