bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
                                SDValue &Overflow, SelectionDAG &DAG) const {
SDLoc dl(Node);
EVT VT = Node->getValueType(0);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ bool isSigned = Node->getOpcode() == ISD::SMULO;
+
+ // For power-of-two multiplications we can use a simpler shift expansion.
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
+ const APInt &C = RHSC->getAPIntValue();
+ // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
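+    // e.g. umulo(x, 8) becomes { x << 3, (x << 3) u>> 3 != x }: the compare
+    // fails exactly when the left shift discarded set bits, i.e. when the
+    // multiply would have overflowed; for smulo the arithmetic shift-back
+    // additionally catches a changed sign bit.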
+ if (C.isPowerOf2()) {
+      // smulo(x, signed_min) is the same as umulo(x, signed_min).
+ bool UseArithShift = isSigned && !C.isMinSignedValue();
+ EVT ShiftAmtTy = getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue ShiftAmt = DAG.getConstant(C.logBase2(), dl, ShiftAmtTy);
+ Result = DAG.getNode(ISD::SHL, dl, VT, LHS, ShiftAmt);
+ Overflow = DAG.getSetCC(dl, SetCCVT,
+ DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
+ dl, VT, Result, ShiftAmt),
+ LHS, ISD::SETNE);
+ return true;
+ }
+ }
+
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() * 2);
if (VT.isVector())
WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
VT.getVectorNumElements());
- SDValue LHS = Node->getOperand(0);
- SDValue RHS = Node->getOperand(1);
SDValue BottomHalf;
SDValue TopHalf;
static const unsigned Ops[2][3] =
{ { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
{ ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
- bool isSigned = Node->getOpcode() == ISD::SMULO;
if (isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
  } else if (isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
    SDVTList VTs = DAG.getVTList(VT, VT);
    BottomHalf = DAG.getNode(Ops[isSigned][1], dl, VTs, LHS, RHS);
    TopHalf = BottomHalf.getValue(1);
  }
- EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
Result = BottomHalf;
if (isSigned) {
    SDValue ShiftAmt = DAG.getConstant(
        VT.getScalarSizeInBits() - 1, dl,
        getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
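For readers unfamiliar with the DAG nodes involved, here is a minimal scalar
C++ model of the check the new expansion performs, assuming 32-bit elements;
the names umulo_pow2/smulo_pow2 are illustrative and not part of the patch:

// Scalar sketch of the shift expansion above (illustrative only).
#include <cstdint>

struct MulO { uint32_t Result; bool Overflow; };

// umulo(x, 1 << S): overflow iff the logical shift-back loses bits.
MulO umulo_pow2(uint32_t X, unsigned S) {
  uint32_t R = X << S;           // ISD::SHL
  return {R, (R >> S) != X};     // ISD::SRL + ISD::SETNE
}

// smulo(x, 1 << S) for S < 31: overflow iff the arithmetic shift-back
// loses bits or changes the sign. For S == 31 (C == signed_min) the
// expansion uses the logical shift-back instead, as in umulo_pow2.
MulO smulo_pow2(int32_t X, unsigned S) {
  int32_t R = (int32_t)((uint32_t)X << S); // ISD::SHL
  return {(uint32_t)R, (R >> S) != X};     // ISD::SRA (>> on int) + ISD::SETNE
}

The X86 test updates below show how this expansion lowers for <4 x i32>.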
define <4 x i32> @umul_v4i32_1(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: umul_v4i32_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
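Note that after the shift expansion, umulo(x, 1) is { x << 0, (x u>> 0) != x },
which constant-folds to { x, false }; the select then reduces to %a, already in
%xmm0, so the whole body folds down to a bare retq. The same folding applies to
smul_v4i32_1 below.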
define <4 x i32> @umul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: umul_v4i32_8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8]
-; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $3, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
define <4 x i32> @umul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: umul_v4i32_2pow31:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
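The vpand+vpcmpeqd sequences above come from a standard DAG combine rather
than from this patch: a shift-left/logical-shift-right pair folds into a
mask, e.g. for 32-bit lanes

  ((x << 3) u>> 3)  ==  (x & 0x1fffffff)

so the (X << S) >> S != X overflow test becomes a single mask-and-compare
against the original value.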
define <4 x i32> @smul_v4i32_1(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1]
-; AVX-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
define <4 x i32> @smul_v4i32_8(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_8:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8]
-; AVX-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; AVX-NEXT: vpslld $3, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpslld $3, %xmm0, %xmm2
+; AVX-NEXT: vpsrad $3, %xmm2, %xmm3
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
%y = extractvalue { <4 x i32>, <4 x i1> } %x, 0
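smul_v4i32_2pow31 below exercises the C.isMinSignedValue() special case: the
expansion uses the logical shift-back, so it lowers to the same
vpand+vpcmpeqd pattern as the unsigned tests. Concretely, overflow iff
(x & 1) != x, i.e. iff x is neither 0 nor 1, the only multipliers for which
x * INT32_MIN is representable in 32 bits.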
define <4 x i32> @smul_v4i32_2pow31(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX-LABEL: smul_v4i32_2pow31:
; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX-NEXT: vpmuldq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpmuldq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm2
+; AVX-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%x = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> %a, <4 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>)