unsigned Depth = 0,
bool AssumeSingleUse = false) const;
+  /// Helper wrapper around SimplifyDemandedBits for use from DAG combines.
+ bool SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+ DAGCombinerInfo &DCI) const;
+
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets.
virtual void computeKnownBitsForTargetNode(const SDValue Op,
return true;
}
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
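+  // Record whether types and operations have already been legalized; this
+  // restricts which transforms SimplifyDemandedBits is allowed to perform.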
+ TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ APInt KnownZero, KnownOne;
+
+ bool Simplified = SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne,
+ TLO);
+ if (Simplified)
+ DCI.CommitTargetLoweringOpt(TLO);
+ return Simplified;
+}
+
/// Look at Op. At this point, we know that only the DemandedMask bits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
void SelectConcatVector(SDNode *N);
void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
- bool trySMLAWSMULW(SDNode *N);
-
void SelectCMP_SWAP(SDNode *N);
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
return false;
}
-static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
- bool Accumulate) {
- // For SM*WB, we need to some form of sext.
- // For SM*WT, we need to search for (sra X, 16)
- // Src1 then gets set to X.
- if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
- SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
- SignExt.getOpcode() == ISD::AssertSext) &&
- SignExt.getValueType() == MVT::i32) {
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = SignExt.getOperand(0);
- return true;
- }
-
- if (SignExt.getOpcode() != ISD::SRA)
- return false;
-
- ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
- if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
- return false;
-
- SDValue Op0 = SignExt.getOperand(0);
-
- // The sign extend operand for SM*WB could be generated by a shl and ashr.
- if (Op0.getOpcode() == ISD::SHL) {
- SDValue SHL = Op0;
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
- return false;
-
- *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
- Src1 = Op0.getOperand(0);
- return true;
- }
- *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
- Src1 = SignExt.getOperand(0);
- return true;
-}
-
-static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
- SDValue &Src1, bool Accumulate) {
- // First we look for:
- // (add (or (srl ?, 16), (shl ?, 16)))
- if (OR.getOpcode() != ISD::OR)
- return false;
-
- SDValue SRL = OR.getOperand(0);
- SDValue SHL = OR.getOperand(1);
-
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
- SRL = OR.getOperand(1);
- SHL = OR.getOperand(0);
- if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
- return false;
- }
-
- ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
- ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
- if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
- SHLSrc1->getZExtValue() != 16)
- return false;
-
- // The first operands to the shifts need to be the two results from the
- // same smul_lohi node.
- if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
- SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
- return false;
-
- SDNode *SMULLOHI = SRL.getOperand(0).getNode();
- if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
- SHL.getOperand(0) != SDValue(SMULLOHI, 1))
- return false;
-
- // Now we have:
- // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
- // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
- // For SMLAWB the 16-bit value will signed extended somehow.
- // For SMLAWT only the SRA is required.
-
- // Check both sides of SMUL_LOHI
- if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
- Src0 = SMULLOHI->getOperand(1);
- } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
- Accumulate)) {
- Src0 = SMULLOHI->getOperand(0);
- } else {
- return false;
- }
- return true;
-}
-
-bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
- if (!Subtarget->hasV6Ops() ||
- (Subtarget->isThumb() && !Subtarget->hasThumb2()))
- return false;
-
- SDLoc dl(N);
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- SDValue A, B;
- unsigned Opc = 0;
-
- if (N->getOpcode() == ISD::ADD) {
- if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
- return false;
-
- SDValue Acc;
- if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
- Acc = Src1;
- } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
- Acc = Src0;
- } else {
- return false;
- }
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32) };
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
- return true;
- } else if (N->getOpcode() == ISD::OR &&
- SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
- if (Opc == 0)
- return false;
-
- SDValue Ops[] = { A, B, getAL(CurDAG, dl),
- CurDAG->getRegister(0, MVT::i32)};
- CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
- return true;
- }
- return false;
-}
-
/// We've got special pseudo-instructions for these
void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
switch (N->getOpcode()) {
default: break;
- case ISD::ADD:
- case ISD::OR:
- if (trySMLAWSMULW(N))
- return;
- break;
case ISD::WRITE_REGISTER:
if (tryWriteRegister(N))
return;
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
+ case ARMISD::SMULWB: return "ARMISD::SMULWB";
+ case ARMISD::SMULWT: return "ARMISD::SMULWT";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::BFI: return "ARMISD::BFI";
case ARMISD::VORRIMM: return "ARMISD::VORRIMM";
// Lowering Code
//===----------------------------------------------------------------------===//
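+// Matchers for shifts by a constant 16, used to pick out the 16-bit halves
+// involved in the SMULW[B|T] patterns below.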
+static bool isSRL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSRA16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SRA)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+static bool isSHL16(const SDValue &Op) {
+ if (Op.getOpcode() != ISD::SHL)
+ return false;
+ if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return Const->getZExtValue() == 16;
+ return false;
+}
+
+// Check for a signed 16-bit value. We special case SRA because it makes it
+// simpler when also looking for SRAs that aren't sign-extending a smaller
+// value. Without the check, we'd need to take extra care with the checking
+// order for some operations.
+static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
+ if (isSRA16(Op))
+ return isSHL16(Op.getOperand(0));
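+  // 17 sign bits on an i32 means the value is a sign-extended 16-bit
+  // quantity.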
+ return DAG.ComputeNumSignBits(Op) == 17;
+}
+
/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
switch (CC) {
return SDValue();
}
+// Try combining OR nodes to SMULWB, SMULWT.
+static SDValue PerformORCombineToSMULWBT(SDNode *OR,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
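+  // This combine needs at least ARMv6; in Thumb mode it also needs Thumb-2
+  // and the DSP extension.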
+ if (!Subtarget->hasV6Ops() ||
+ (Subtarget->isThumb() &&
+ (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
+ return SDValue();
+
+ SDValue SRL = OR->getOperand(0);
+ SDValue SHL = OR->getOperand(1);
+
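+  // The shifts can appear on either side of the OR.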
+ if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+ SRL = OR->getOperand(1);
+ SHL = OR->getOperand(0);
+ }
+ if (!isSRL16(SRL) || !isSHL16(SHL))
+ return SDValue();
+
+ // The first operands to the shifts need to be the two results from the
+ // same smul_lohi node.
+ if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+ SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+ if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+ SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+ return SDValue();
+
+  // Now we have:
+  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SMULW[B|T], smul_lohi will take a 32-bit and a 16-bit argument.
+  // For SMULWB, the 16-bit value will be sign extended somehow.
+  // For SMULWT, only the SRA is required.
+  // Check both sides of SMUL_LOHI.
+ SDValue OpS16 = SMULLOHI->getOperand(0);
+ SDValue OpS32 = SMULLOHI->getOperand(1);
+
+ SelectionDAG &DAG = DCI.DAG;
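+  // If operand 0 is neither a sign-extended 16-bit value nor a top-half SRA,
+  // assume the operands are in the opposite order.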
+ if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
+ OpS16 = OpS32;
+ OpS32 = SMULLOHI->getOperand(0);
+ }
+
+ SDLoc dl(OR);
+ unsigned Opcode = 0;
+ if (isS16(OpS16, DAG))
+ Opcode = ARMISD::SMULWB;
+ else if (isSRA16(OpS16)) {
+ Opcode = ARMISD::SMULWT;
+ OpS16 = OpS16->getOperand(0);
+  } else
+ return SDValue();
+
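+  // Build the SMULW node and replace all uses of the OR with it.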
+ SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
+ return SDValue(OR, 0);
+}
+
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
// fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
return Result;
+ if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+ return Result;
}
// The code below optimizes (or (and X, Y), Z).
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
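+  // SMULWB reads only the bottom 16 bits of its second operand, and SMULWT
+  // only the top 16; try to simplify that operand based on the demanded bits.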
+ case ARMISD::SMULWB: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+ case ARMISD::SMULWT: {
+ unsigned BitWidth = N->getValueType(0).getSizeInBits();
+ APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
VMULLs, // ...signed
VMULLu, // ...unsigned
+ SMULWB, // Signed multiply word by half word, bottom
+ SMULWT, // Signed multiply word by half word, top
UMLAL, // 64bit Unsigned Accumulate Multiply
SMLAL, // 64bit Signed Accumulate Multiply
UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
SDNPMayStore, SDNPMayLoad]>;
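+// Nodes for the 32x16-bit signed multiplies: the second operand supplies the
+// bottom (SMULWB) or top (SMULWT) 16-bit half, and the result is the top 32
+// bits of the 48-bit product.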
+def ARMsmulwb : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
+def ARMsmulwt : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
+
//===----------------------------------------------------------------------===//
// ARM Instruction Predicate Definitions.
//
def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
- []>,
+ [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
- []>,
+ [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
Requires<[IsARM, HasV5TE]>,
Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
}
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
- []>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
Requires<[IsARM, HasV5TE, UseMulOps]>,
Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
}
def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
[(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16))))]>;
-def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
-def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb",
+ [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt",
+ [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>;
def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
(t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
[(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>;
-def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
-def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
+ [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwt rGPR:$Rn, rGPR:$Rm)))]>;
def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
(t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
%tmp5 = add i32 %a, %tmp4
ret i32 %tmp5
}
+
+@global_b = external global i16, align 2
+
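+; Computes (i32)(((i64)a * (i16)b) >> 16), which should select smulwb.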
+define i32 @f22(i32 %a) {
+; CHECK-LABEL: f22:
+; CHECK: smulwb r0, r0, r1
+; CHECK-THUMBV6-NOT: smulwb
+ %b = load i16, i16* @global_b, align 2
+ %sext = sext i16 %b to i64
+ %conv = sext i32 %a to i64
+ %mul = mul nsw i64 %sext, %conv
+ %shr37 = lshr i64 %mul, 16
+ %conv4 = trunc i64 %shr37 to i32
+ ret i32 %conv4
+}
+
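+; As above, but with an accumulate, which should select smlawb.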
+define i32 @f23(i32 %a, i32 %c) {
+; CHECK-LABEL: f23:
+; CHECK: smlawb r0, r0, r2, r1
+; CHECK-THUMBV6-NOT: smlawb
+ %b = load i16, i16* @global_b, align 2
+ %sext = sext i16 %b to i64
+ %conv = sext i32 %a to i64
+ %mul = mul nsw i64 %sext, %conv
+ %shr49 = lshr i64 %mul, 16
+ %conv5 = trunc i64 %shr49 to i32
+ %add = add nsw i32 %conv5, %c
+ ret i32 %add
+}