""""""""""
The arguments (%a and %b) and the result may be of integer types of any bit
-width, but they must have the same bit width. ``%a`` and ``%b`` are the two
+width, but they must have the same bit width. The arguments may also be
+vectors of integers, in which case they must have the same number of elements
+and element bit width. ``%a`` and ``%b`` are the two
values that will undergo signed fixed point multiplication. The argument
``%scale`` represents the scale of both operands, and must be a constant
integer.
value is rounded up or down to the closest representable value. The rounding
direction is unspecified.
-It is undefined behavior if the source value does not fit within the range of
+It is undefined behavior if the result value does not fit within the range of
the fixed point type.
%res = call i4 @llvm.smul.fix.i4(i4 3, i4 -3, i32 1) ; %res = -5 (or -4) (1.5 x -1.5 = -2.25)
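
A rough scalar model of these semantics, given here only as an illustrative
sketch (it assumes 32-bit operands and an arithmetic right shift for negative
values; it is not the LLVM lowering):

.. code-block:: c++

  #include <cstdint>
  #include <cassert>

  // Multiply in a wider type, then drop the scale bits. The arithmetic shift
  // rounds toward negative infinity, one of the directions the intrinsic
  // permits.
  int32_t smul_fix_i32(int32_t a, int32_t b, unsigned scale) {
    assert(scale < 32 && "smul.fix: scale must be less than the bit width");
    int64_t wide = (int64_t)a * (int64_t)b;
    int64_t shifted = wide >> scale; // the result keeps the same scale
    // The intrinsic has undefined behavior if this truncation loses bits.
    return (int32_t)shifted;
  }

For example, ``smul_fix_i32(3, -3, 1)`` gives -5 (-2.5 at scale 1), matching
the i4 example above; an implementation rounding the other way may give -4.
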
+'``llvm.umul.fix.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.umul.fix``
+on any integer bit width or vectors of integers.
+
+::
+
+ declare i16 @llvm.umul.fix.i16(i16 %a, i16 %b, i32 %scale)
+ declare i32 @llvm.umul.fix.i32(i32 %a, i32 %b, i32 %scale)
+ declare i64 @llvm.umul.fix.i64(i64 %a, i64 %b, i32 %scale)
+ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %a, <4 x i32> %b, i32 %scale)
+
+Overview:
+"""""""""
+
+The '``llvm.umul.fix``' family of intrinsic functions performs unsigned
+fixed point multiplication on two arguments of the same scale.
+
+Arguments:
+""""""""""
+
+The arguments (%a and %b) and the result may be of integer types of any bit
+width, but they must have the same bit width. The arguments may also be
+vectors of integers, in which case they must have the same number of elements
+and element bit width. ``%a`` and ``%b`` are the two
+values that will undergo unsigned fixed point multiplication. The argument
+``%scale`` represents the scale of both operands, and must be a constant
+integer.
+
+Semantics:
+""""""""""
+
+This operation performs unsigned fixed point multiplication on the two
+arguments, which share the scale given by the third argument. The result is
+returned with that same scale.
+
+If the result value cannot be precisely represented in the given scale, the
+value is rounded up or down to the closest representable value. The rounding
+direction is unspecified.
+
+It is undefined behavior if the result value does not fit within the range of
+the fixed point type.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+ %res = call i4 @llvm.umul.fix.i4(i4 3, i4 2, i32 0) ; %res = 6 (2 x 3 = 6)
+ %res = call i4 @llvm.umul.fix.i4(i4 3, i4 2, i32 1) ; %res = 3 (1.5 x 1 = 1.5)
+
+ ; The result in the following could be rounded down to 3.5 or up to 4
+ %res = call i4 @llvm.umul.fix.i4(i4 15, i4 1, i32 1) ; %res = 7 (or 8) (7.5 x 0.5 = 3.75)
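
As an informal sketch (not part of the text above), the unsigned semantics can
be modeled with a widening multiply. Unlike ``llvm.smul.fix``, the scale may
equal the operand bit width, in which case the model returns the high half of
the double-width product:

.. code-block:: c++

  #include <cstdint>
  #include <cassert>

  // Widen, multiply, then drop the scale bits (this rounds down; the
  // intrinsic leaves the rounding direction unspecified).
  uint32_t umul_fix_i32(uint32_t a, uint32_t b, unsigned scale) {
    assert(scale <= 32 && "umul.fix: scale may be at most the bit width");
    uint64_t wide = (uint64_t)a * (uint64_t)b;
    uint64_t shifted = wide >> scale;
    // The intrinsic has undefined behavior if the result does not fit in the
    // 32-bit return type.
    return (uint32_t)shifted;
  }

``umul_fix_i32(15, 1, 1)`` yields 7 (3.5 at scale 1), the rounded-down result
from the last example above; with a scale of 32 the model simply returns the
top 32 bits of the 64-bit product.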
+
+
Specialised Arithmetic Intrinsics
---------------------------------
/// resulting value is this minimum value.
SSUBSAT, USUBSAT,
- /// RESULT = SMULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on
+ /// RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on
/// 2 integers with the same width and scale. SCALE represents the scale of
/// both operands as fixed point numbers. This SCALE parameter must be a
/// constant integer. A scale of zero is effectively performing
/// multiplication on 2 integers.
- SMULFIX,
+ SMULFIX, UMULFIX,
/// Simple binary floating point operators.
FADD, FSUB, FMUL, FDIV, FREM,
default:
llvm_unreachable("Unexpected fixed point operation.");
case ISD::SMULFIX:
+ case ISD::UMULFIX:
Supported = isSupportedFixedPointOperation(Op, VT, Scale);
break;
}
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, Commutative]>;
+def int_umul_fix : Intrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty],
+ [IntrNoMem, IntrSpeculatable, Commutative]>;
+
//===------------------------- Memory Use Markers -------------------------===//
//
def int_lifetime_start : Intrinsic<[],
def SDTIntBinHiLoOp : SDTypeProfile<2, 2, [ // mulhi, mullo, sdivrem, udivrem
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,SDTCisInt<0>
]>;
-def SDTIntScaledBinOp : SDTypeProfile<1, 3, [ // smulfix
+def SDTIntScaledBinOp : SDTypeProfile<1, 3, [ // smulfix, umulfix
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
def uaddsat : SDNode<"ISD::UADDSAT" , SDTIntBinOp, [SDNPCommutative]>;
def ssubsat : SDNode<"ISD::SSUBSAT" , SDTIntBinOp>;
def usubsat : SDNode<"ISD::USUBSAT" , SDTIntBinOp>;
+
def smulfix : SDNode<"ISD::SMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
+def umulfix : SDNode<"ISD::UMULFIX" , SDTIntScaledBinOp, [SDNPCommutative]>;
def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
}
- case ISD::SMULFIX: {
+ case ISD::SMULFIX:
+ case ISD::UMULFIX: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
Results.push_back(TLI.expandAddSubSat(Node, DAG));
break;
case ISD::SMULFIX:
+ case ISD::UMULFIX:
Results.push_back(TLI.expandFixedPointMul(Node, DAG));
break;
case ISD::SADDO:
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break;
- case ISD::SMULFIX: Res = PromoteIntRes_SMULFIX(N); break;
+ case ISD::SMULFIX:
+ case ISD::UMULFIX: Res = PromoteIntRes_MULFIX(N); break;
case ISD::ATOMIC_LOAD:
Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SMULFIX(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
// Can just promote the operands then continue with operation.
SDLoc dl(N);
- SDValue Op1Promoted = SExtPromotedInteger(N->getOperand(0));
- SDValue Op2Promoted = SExtPromotedInteger(N->getOperand(1));
+ SDValue Op1Promoted, Op2Promoted;
+ if (N->getOpcode() == ISD::SMULFIX) {
+ Op1Promoted = SExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = SExtPromotedInteger(N->getOperand(1));
+ } else {
+ Op1Promoted = ZExtPromotedInteger(N->getOperand(0));
+ Op2Promoted = ZExtPromotedInteger(N->getOperand(1));
+ }
EVT PromotedType = Op1Promoted.getValueType();
return DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted, Op2Promoted,
N->getOperand(2));
case ISD::PREFETCH: Res = PromoteIntOp_PREFETCH(N, OpNo); break;
- case ISD::SMULFIX: Res = PromoteIntOp_SMULFIX(N); break;
+ case ISD::SMULFIX:
+ case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
}
return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, Carry), 0);
}
-SDValue DAGTypeLegalizer::PromoteIntOp_SMULFIX(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntOp_MULFIX(SDNode *N) {
SDValue Op2 = ZExtPromotedInteger(N->getOperand(2));
return SDValue(
DAG.UpdateNodeOperands(N, N->getOperand(0), N->getOperand(1), Op2), 0);
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT: ExpandIntRes_ADDSUBSAT(N, Lo, Hi); break;
- case ISD::SMULFIX: ExpandIntRes_SMULFIX(N, Lo, Hi); break;
+ case ISD::SMULFIX:
+ case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
SplitInteger(Result, Lo, Hi);
}
-void DAGTypeLegalizer::ExpandIntRes_SMULFIX(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(
+ (N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::UMULFIX) &&
+ "Expected operand to be signed or unsigned fixed point multiplication");
+
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue LHS = N->getOperand(0);
GetExpandedInteger(RHS, RL, RH);
SmallVector<SDValue, 4> Result;
- if (!TLI.expandMUL_LOHI(ISD::SMUL_LOHI, VT, dl, LHS, RHS, Result, NVT, DAG,
+ bool Signed = N->getOpcode() == ISD::SMULFIX;
+ unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+ if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
LL, LH, RL, RH)) {
- report_fatal_error("Unable to expand SMUL_FIX using SMUL_LOHI.");
+ report_fatal_error("Unable to expand MUL_FIX using MUL_LOHI.");
return;
}
Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
+ } else if (Scale == VTSize) {
+ assert(
+ !Signed &&
+ "Only unsigned types can have a scale equal to the operand bit width");
+
+ Lo = ResultHL;
+ Hi = ResultHH;
} else {
- llvm_unreachable(
- "Expected the scale to be less than the width of the operands");
+ llvm_unreachable("Expected the scale to be less than or equal to the width "
+ "of the operands");
}
}
SDValue PromoteIntRes_VAARG(SDNode *N);
SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
- SDValue PromoteIntRes_SMULFIX(SDNode *N);
+ SDValue PromoteIntRes_MULFIX(SDNode *N);
SDValue PromoteIntRes_FLT_ROUNDS(SDNode *N);
// Integer Operand Promotion.
SDValue PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N);
SDValue PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo);
- SDValue PromoteIntOp_SMULFIX(SDNode *N);
+ SDValue PromoteIntOp_MULFIX(SDNode *N);
SDValue PromoteIntOp_FPOWI(SDNode *N);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ADDSUBSAT (SDNode *N, SDValue &Lo, SDValue &Hi);
- void ExpandIntRes_SMULFIX (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MULFIX (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
SDValue ScalarizeVecRes_UNDEF(SDNode *N);
SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
- SDValue ScalarizeVecRes_SMULFIX(SDNode *N);
+ SDValue ScalarizeVecRes_MULFIX(SDNode *N);
// Vector Operand Scalarization: <1 x ty> -> ty.
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
- void SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
case ISD::USUBSAT:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
- case ISD::SMULFIX: {
+ case ISD::SMULFIX:
+ case ISD::UMULFIX: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
case ISD::SADDSAT:
return ExpandAddSubSat(Op);
case ISD::SMULFIX:
+ case ISD::UMULFIX:
return ExpandFixedPointMul(Op);
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
R = ScalarizeVecRes_StrictFPOp(N);
break;
case ISD::SMULFIX:
- R = ScalarizeVecRes_SMULFIX(N);
+ case ISD::UMULFIX:
+ R = ScalarizeVecRes_MULFIX(N);
break;
}
Op0.getValueType(), Op0, Op1, Op2);
}
-SDValue DAGTypeLegalizer::ScalarizeVecRes_SMULFIX(SDNode *N) {
+SDValue DAGTypeLegalizer::ScalarizeVecRes_MULFIX(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = N->getOperand(2);
SplitVecRes_StrictFPOp(N, Lo, Hi);
break;
case ISD::SMULFIX:
- SplitVecRes_SMULFIX(N, Lo, Hi);
+ case ISD::UMULFIX:
+ SplitVecRes_MULFIX(N, Lo, Hi);
break;
}
Op0Hi, Op1Hi, Op2Hi);
}
-void DAGTypeLegalizer::SplitVecRes_SMULFIX(SDNode *N, SDValue &Lo,
- SDValue &Hi) {
+void DAGTypeLegalizer::SplitVecRes_MULFIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
# define setjmp_undefined_for_msvc
#endif
+static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
+ switch (Intrinsic) {
+ case Intrinsic::smul_fix:
+ return ISD::SMULFIX;
+ case Intrinsic::umul_fix:
+ return ISD::UMULFIX;
+ default:
+ llvm_unreachable("Unhandled fixed point intrinsic");
+ }
+}
+
/// Lower the call to the specified intrinsic function. If we want to emit this
/// as a call to a named external function, return the name. Otherwise, lower it
/// and return null.
setValue(&I, DAG.getNode(ISD::USUBSAT, sdl, Op1.getValueType(), Op1, Op2));
return nullptr;
}
- case Intrinsic::smul_fix: {
+ case Intrinsic::smul_fix:
+ case Intrinsic::umul_fix: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
- setValue(&I,
- DAG.getNode(ISD::SMULFIX, sdl, Op1.getValueType(), Op1, Op2, Op3));
+ setValue(&I, DAG.getNode(FixedPointIntrinsicToOpcode(Intrinsic), sdl,
+ Op1.getValueType(), Op1, Op2, Op3));
return nullptr;
}
case Intrinsic::stacksave: {
case ISD::SSUBSAT: return "ssubsat";
case ISD::USUBSAT: return "usubsat";
case ISD::SMULFIX: return "smulfix";
+ case ISD::UMULFIX: return "umulfix";
// Conversion operators.
case ISD::SIGN_EXTEND: return "sign_extend";
SDValue
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
- assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX.");
+ assert((Node->getOpcode() == ISD::SMULFIX ||
+ Node->getOpcode() == ISD::UMULFIX) &&
+ "Expected opcode to be SMULFIX or UMULFIX.");
SDLoc dl(Node);
SDValue LHS = Node->getOperand(0);
return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
}
+ unsigned VTSize = VT.getScalarSizeInBits();
+ bool Signed = Node->getOpcode() == ISD::SMULFIX;
+
+ assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
+ "Expected scale to be less than the number of bits if signed or at "
+ "most the number of bits if unsigned.");
assert(LHS.getValueType() == RHS.getValueType() &&
"Expected both operands to be the same type");
- assert(Scale < VT.getScalarSizeInBits() &&
- "Expected scale to be less than the number of bits.");
// Get the upper and lower bits of the result.
SDValue Lo, Hi;
- if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
- SDValue Result =
- DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
+ unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
+ unsigned HiOp = Signed ? ISD::MULHS : ISD::MULHU;
+ if (isOperationLegalOrCustom(LoHiOp, VT)) {
+ SDValue Result = DAG.getNode(LoHiOp, dl, DAG.getVTList(VT, VT), LHS, RHS);
Lo = Result.getValue(0);
Hi = Result.getValue(1);
- } else if (isOperationLegalOrCustom(ISD::MULHS, VT)) {
+ } else if (isOperationLegalOrCustom(HiOp, VT)) {
Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
- Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS);
+ Hi = DAG.getNode(HiOp, dl, VT, LHS, RHS);
} else if (VT.isVector()) {
return SDValue();
} else {
- report_fatal_error("Unable to expand signed fixed point multiplication.");
+ report_fatal_error("Unable to expand fixed point multiplication.");
}
+ if (Scale == VTSize)
+ // Result is just the top half since we'd be shifting by the width of the
+ // operand.
+ return Hi;
+
// The result will need to be shifted right by the scale since both operands
// are scaled. The result is given to us in 2 halves, so we only want part of
// both in the result.
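
The recombination built here, together with the Scale == VTSize case above,
can be pictured with the following scalar sketch for a 32-bit type
(illustrative only, not code from this patch):

  #include <cstdint>

  // Lo and Hi are the low and high 32 bits of the 64-bit product of the two
  // scaled operands.
  static uint32_t combineMulFixHalves(uint32_t Lo, uint32_t Hi,
                                      unsigned Scale) {
    const unsigned VTSize = 32;
    if (Scale == 0)
      return Lo; // plain integer multiplication
    if (Scale == VTSize)
      return Hi; // unsigned only: the result is the top half
    // Keep the top (VTSize - Scale) bits of Lo and the low Scale bits of Hi.
    return (Lo >> Scale) | (Hi << (VTSize - Scale));
  }
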
setOperationAction(ISD::SSUBSAT, VT, Expand);
setOperationAction(ISD::USUBSAT, VT, Expand);
setOperationAction(ISD::SMULFIX, VT, Expand);
+ setOperationAction(ISD::UMULFIX, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
"of ints");
break;
}
- case Intrinsic::smul_fix: {
+ case Intrinsic::smul_fix:
+ case Intrinsic::umul_fix: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Assert(Op1->getType()->isIntOrIntVectorTy(),
- "first operand of smul_fix must be an int type or vector "
+ "first operand of [us]mul_fix must be an int type or vector "
"of ints");
Assert(Op2->getType()->isIntOrIntVectorTy(),
- "second operand of smul_fix must be an int type or vector "
+ "second operand of [us]mul_fix must be an int type or vector "
"of ints");
auto *Op3 = dyn_cast<ConstantInt>(Call.getArgOperand(2));
- Assert(Op3, "third argument of smul_fix must be a constant integer");
+ Assert(Op3, "third argument of [us]mul_fix must be a constant integer");
Assert(Op3->getType()->getBitWidth() <= 32,
- "third argument of smul_fix must fit within 32 bits");
- Assert(Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
- "the scale of smul_fix must be less than the width of the operands");
+ "third argument of [us]mul_fix must fit within 32 bits");
+
+ if (ID == Intrinsic::smul_fix) {
+ Assert(
+ Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
+ "the scale of smul_fix must be less than the width of the operands");
+ } else {
+ Assert(Op3->getZExtValue() <= Op1->getType()->getScalarSizeInBits(),
+ "the scale of umul_fix must be less than or equal to the width of "
+ "the operands");
+ }
break;
}
};
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+
+declare i4 @llvm.umul.fix.i4 (i4, i4, i32)
+declare i32 @llvm.umul.fix.i32 (i32, i32, i32)
+declare i64 @llvm.umul.fix.i64 (i64, i64, i32)
+declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
+
+define i32 @func(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: imulq %rax, %rcx
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: shldl $30, %ecx, %eax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: shrdl $2, %edx, %eax
+; X86-NEXT: retl
+ %tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 2);
+ ret i32 %tmp;
+}
+
+define i64 @func2(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func2:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $2, %rdx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: shldl $30, %eax, %edx
+; X86-NEXT: shldl $30, %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 2);
+ ret i64 %tmp;
+}
+
+define i4 @func3(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func3:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $15, %esi
+; X64-NEXT: andl $15, %eax
+; X64-NEXT: imull %esi, %eax
+; X64-NEXT: shrb $2, %al
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func3:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: andb $15, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $15, %cl
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: imull %ecx, %eax
+; X86-NEXT: shrb $2, %al
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2);
+ ret i4 %tmp;
+}
+
+define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec:
+; X64: # %bb.0:
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-NEXT: pmuludq %xmm2, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; X64-NEXT: pslld $30, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: psrld $2, %xmm0
+; X64-NEXT: por %xmm3, %xmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: shldl $30, %eax, %ebp
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shldl $30, %eax, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shldl $30, %eax, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: shldl $30, %eax, %edx
+; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
+; X86-NEXT: movl %ebp, (%ecx)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2);
+ ret <4 x i32> %tmp;
+}
+
+; These result in regular integer multiplication
+define i32 @func4(i32 %x, i32 %y) nounwind {
+; X64-LABEL: func4:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imull %esi, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: func4:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+ %tmp = call i32 @llvm.umul.fix.i32(i32 %x, i32 %y, i32 0);
+ ret i32 %tmp;
+}
+
+define i64 @func5(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func5:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: imulq %rsi, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func5:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 0);
+ ret i64 %tmp;
+}
+
+define i4 @func6(i4 %x, i4 %y) nounwind {
+; X64-LABEL: func6:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andb $15, %al
+; X64-NEXT: andb $15, %sil
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: mulb %sil
+; X64-NEXT: retq
+;
+; X86-LABEL: func6:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: andb $15, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $15, %cl
+; X86-NEXT: mulb %cl
+; X86-NEXT: retl
+ %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 0);
+ ret i4 %tmp;
+}
+
+define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
+; X64-LABEL: vec2:
+; X64: # %bb.0:
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-NEXT: pmuludq %xmm2, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+;
+; X86-LABEL: vec2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 4(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+ %tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 0);
+ ret <4 x i32> %tmp;
+}
+
+define i64 @func7(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func7:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $32, %rdx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func7:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 32);
+ ret i64 %tmp;
+}
+
+define i64 @func8(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func8:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: shrdq $63, %rdx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %ebp, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: shrdl $31, %edx, %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 63);
+ ret i64 %tmp;
+}
+
+define i64 @func9(i64 %x, i64 %y) nounwind {
+; X64-LABEL: func9:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %rax
+; X64-NEXT: retq
+;
+; X86-LABEL: func9:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: addl %edx, %ebp
+; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %tmp = call i64 @llvm.umul.fix.i64(i64 %x, i64 %y, i32 64);
+ ret i64 %tmp;
+}