// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
- // At least one of the operands should be a vector shuffle.
- if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
- RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
- return false;
-
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary) &&
+ SrcOps.size() <= 2 && SrcShuffleMask.size() == NumElts) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ };
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
- unsigned NumElts = VT.getVectorNumElements();
SDValue A, B;
- SmallVector<int, 16> LMask(NumElts);
- if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!LHS.getOperand(0).isUndef())
- A = LHS.getOperand(0);
- if (!LHS.getOperand(1).isUndef())
- B = LHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS)->getMask();
- llvm::copy(Mask, LMask.begin());
- } else {
- A = LHS;
- for (unsigned i = 0; i != NumElts; ++i)
- LMask[i] = i;
- }
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 16> RMask(NumElts);
- if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!RHS.getOperand(0).isUndef())
- C = RHS.getOperand(0);
- if (!RHS.getOperand(1).isUndef())
- D = RHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
- llvm::copy(Mask, RMask.begin());
- } else {
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ if (LMask.empty() && RMask.empty())
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
- RMask[i] = i;
+ RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
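
With this change an operand counts as a shuffle if it is either a plain ISD::VECTOR_SHUFFLE or an already-lowered target shuffle whose mask getTargetShuffleMask can recover through bitcasts; an empty mask is the "not a shuffle" signal that selects the identity-mask fallback above. A minimal IR sketch of the canonical pattern the matcher is looking for (hypothetical function name); the even/odd masks give exactly the < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > result described in the comment at the top:

define <4 x float> @hadd_v4f32_sketch(<4 x float> %a, <4 x float> %b) {
  %lhs = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = fadd <4 x float> %lhs, %rhs
  ret <4 x float> %sum
}
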
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, IsFadd) &&
shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ return DAG.getNode(HorizOpcode, SDLoc(N), VT, DAG.getBitcast(VT, LHS),
+ DAG.getBitcast(VT, RHS));
return SDValue();
}
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
};
+ Op0 = DAG.getBitcast(VT, Op0);
+ Op1 = DAG.getBitcast(VT, Op1);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HADDBuilder);
}
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
};
+ Op0 = DAG.getBitcast(VT, Op0);
+ Op1 = DAG.getBitcast(VT, Op1);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HSUBBuilder);
}
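
Op0 and Op1 are bitcast back to VT here because the horizontal-op matcher can now return shuffle sources that were found behind bitcasts and therefore still carry their pre-bitcast type. SplitOpsAndApply then emits the HADD/HSUB nodes, splitting into 128-bit halves when the subtarget has no 256-bit form (e.g. integer horizontal ops on AVX1). A hypothetical integer example of the pattern these builders service; on SSSE3 and later targets where horizontal ops are considered profitable this should select a single (v)phaddd:

define <4 x i32> @phadd_v4i32_sketch(<4 x i32> %a, <4 x i32> %b) {
  %lhs = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = add <4 x i32> %lhs, %rhs
  ret <4 x i32> %sum
}
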
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
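
For reference, the first step of the PR39936 reductions has the shape sketched below (a stand-alone illustration of the pattern only, not the literal test body, which reduces all the way to a scalar). Since only the low half of the result is used, the even/odd extracts are lowered to target shuffles of the two 128-bit halves (the vshufps lines removed above), which the old ISD::VECTOR_SHUFFLE-only check could not match; the new target-shuffle path folds them and the add into the single vhaddps:

define <8 x float> @pairwise_fadd_step_v8f32(<8 x float> %x) {
  %even = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %odd = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %sum = fadd <8 x float> %even, %odd
  ret <8 x float> %sum
}
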
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: PR39936_v8i32:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR39936_v8i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
;
-; AVX-FAST-LABEL: PR39936_v8i32:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vmovd %xmm0, %eax
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX1-FAST-LABEL: PR39936_v8i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: PR39936_v8i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR39936_v8i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = add <8 x i32> %2, %3
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
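
The test name encodes the two shuffle masks, so add_v4f64_0246_1357 is roughly the function below (reconstructed from the name as a sketch, not copied from the test file). On AVX1 the lane-crossing halves are assembled with vperm2f128/vinsertf128, and the trailing vunpcklpd/vunpckhpd shuffles plus the vaddpd now fold into the single vhaddpd:

define <4 x double> @add_v4f64_0246_1357(<4 x double> %a, <4 x double> %b) {
entry:
  %even = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %sum = fadd <4 x double> %even, %odd
  ret <4 x double> %sum
}
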
;
; AVX2-LABEL: add_v4f64_0246_1357:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_4602_5713:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: