From: Sam Parker Date: Mon, 20 Jun 2016 16:47:09 +0000 (+0000) Subject: [ARM] Enable isel of UMAAL X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a02f69b1c10ad38d6f9e6f1cf189d6279df7227a;p=llvm [ARM] Enable isel of UMAAL TargetLowering and DAGToDAG are used to combine ADDC, ADDE and UMLAL dags into UMAAL. Selection is split into the two phases because it is easier to match the two patterns at those different times. Differential Revision: http://reviews.llvm.org/D21461 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273165 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index a6087d663a3..7360210ff99 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2939,7 +2939,47 @@ void ARMDAGToDAGISel::Select(SDNode *N) { return; } } + case ARMISD::UMAAL: { + unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL; + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + N->getOperand(2), N->getOperand(3), + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops)); + return; + } case ARMISD::UMLAL:{ + // UMAAL is similar to UMLAL but it adds two 32-bit values to the + // 64-bit multiplication result. 
+ if (Subtarget->hasV6Ops() && N->getOperand(2).getOpcode() == ARMISD::ADDC && + N->getOperand(3).getOpcode() == ARMISD::ADDE) { + + SDValue Addc = N->getOperand(2); + SDValue Adde = N->getOperand(3); + + if (Adde.getOperand(2).getNode() == Addc.getNode()) { + + ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0)); + ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1)); + + if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0) + { + // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm + // RdLo = one operand to be added, lower 32-bits of res + // RdHi = other operand to be added, upper 32-bits of res + // Rn = first multiply operand + // Rm = second multiply operand + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + Addc.getOperand(0), Addc.getOperand(1), + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32) }; + unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL; + CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops); + return; + } + } + } + if (Subtarget->isThumb()) { SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3), getAL(CurDAG, dl), diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 18a583ffc7b..26b4a1c98df 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1212,6 +1212,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; + case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; @@ -8686,11 +8687,6 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - if (Subtarget->isThumb1Only()) return SDValue(); - - // Only perform the checks after 
legalize when the pattern is available. - if (DCI.isBeforeLegalize()) return SDValue(); - // Look for multiply add opportunities. // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is @@ -8818,14 +8814,97 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode, return resNode; } +static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + // UMAAL is similar to UMLAL except that it adds two unsigned values. + // While trying to combine for the other MLAL nodes, first search for the + // chance to use UMAAL. Check if Addc uses another addc node which can first + // be combined into a UMLAL. The other pattern, AddcNode being combined + // into a UMLAL and then using another addc, is handled in ISelDAGToDAG. + + if (!Subtarget->hasV6Ops()) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + SDNode *PrevAddc = nullptr; + if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC) + PrevAddc = AddcNode->getOperand(0).getNode(); + else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC) + PrevAddc = AddcNode->getOperand(1).getNode(); + + // If there are no addc chains, just return a search for any MLAL. + if (PrevAddc == nullptr) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + // Try to convert the addc operand to an MLAL and if that fails try to + // combine AddcNode. + SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget); + if (MLAL != SDValue(PrevAddc, 0)) + return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget); + + // Find the converted UMLAL or quit if it doesn't exist. 
+ SDNode *UmlalNode = nullptr; + SDValue AddHi; + if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { + UmlalNode = AddcNode->getOperand(0).getNode(); + AddHi = AddcNode->getOperand(1); + } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { + UmlalNode = AddcNode->getOperand(1).getNode(); + AddHi = AddcNode->getOperand(0); + } else { + return SDValue(); + } + + // The ADDC should be glued to an ADDE node, which uses the same UMLAL as + // the ADDC as well as Zero. + auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3)); + + if (!Zero || Zero->getZExtValue() != 0) + return SDValue(); + + // Check that we have a glued ADDC node. + if (AddcNode->getValueType(1) != MVT::Glue) + return SDValue(); + + // Look for the glued ADDE. + SDNode* AddeNode = AddcNode->getGluedUser(); + if (!AddeNode) + return SDValue(); + + if ((AddeNode->getOperand(0).getNode() == Zero && + AddeNode->getOperand(1).getNode() == UmlalNode) || + (AddeNode->getOperand(0).getNode() == UmlalNode && + AddeNode->getOperand(1).getNode() == Zero)) { + + SelectionDAG &DAG = DCI.DAG; + SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), + UmlalNode->getOperand(2), AddHi }; + SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), + DAG.getVTList(MVT::i32, MVT::i32), Ops); + + // Replace the ADD nodes' uses with the UMAAL node's values. + DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); + + // Return original node to notify the driver to stop replacing. + return SDValue(AddcNode, 0); + } + return SDValue(); +} + /// PerformADDCCombine - Target-specific dag combine transform from -/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL. 
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or +/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL static SDValue PerformADDCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { - return AddCombineTo64bitMLAL(N, DCI, Subtarget); + if (Subtarget->isThumb1Only()) return SDValue(); + + // Only perform the checks after legalize when the pattern is available. + if (DCI.isBeforeLegalize()) return SDValue(); + return AddCombineTo64bitUMAAL(N, DCI, Subtarget); } /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 85b820c9f6c..4cdc6182040 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -163,6 +163,7 @@ namespace llvm { UMLAL, // 64bit Unsigned Accumulate Multiply SMLAL, // 64bit Signed Accumulate Multiply + UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index fa18ecd2764..b57f3e84d3b 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -95,6 +95,7 @@ def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >; def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>; def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>; +def ARMUmaal : SDNode<"ARMISD::UMAAL", SDT_ARM64bitmlal>; // Node definitions. 
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; @@ -3950,9 +3951,10 @@ def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi), RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>; def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi), - (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64, + (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), + IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsARM, HasV6]> { + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> { bits<4> RdLo; bits<4> RdHi; bits<4> Rm; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index c642ad90c2d..22aca239565 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -2593,8 +2593,9 @@ def t2UMLAL : T2MlaLong<0b110, 0b0000, def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), - (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, + (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, + RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsThumb2, HasDSP]>; } // hasSideEffects diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll index 3f30fd40b7e..cc2d745aae8 100644 --- a/test/CodeGen/ARM/longMAC.ll +++ b/test/CodeGen/ARM/longMAC.ll @@ -116,3 +116,32 @@ define i64 @MACLongTest8(i64 %acc, i32 %lhs, i32 %rhs) { ret i64 %add } +define i64 @MACLongTest9(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) { +;CHECK-LABEL: MACLongTest9: +;CHECK-V7-LE:umaal +;CHECK-V7-BE:umaal +;CHECK-NOT:umaal + %conv = zext i32 %lhs to i64 + %conv1 = zext i32 %rhs to i64 + %mul = mul nuw i64 %conv1, %conv + %conv2 = zext i32 %lo to i64 + %add = add i64 %mul, %conv2 + %conv3 = zext i32 %hi to i64 + %add2 = add i64 %add, %conv3 + ret i64 %add2 +} + +define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) { +;CHECK-LABEL: MACLongTest10: +;CHECK-V7-LE:umaal +;CHECK-V7-BE:umaal +;CHECK-NOT:umaal + %conv = zext i32 %lhs to i64 + %conv1 = zext i32 
%rhs to i64 + %mul = mul nuw i64 %conv1, %conv + %conv2 = zext i32 %lo to i64 + %conv3 = zext i32 %hi to i64 + %add = add i64 %conv2, %conv3 + %add2 = add i64 %add, %mul + ret i64 %add2 +}