addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
-void ARMTargetLowering::addMVEVectorTypes() {
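+// Mark every generic ISD opcode as Expand for this type. Callers then
+// re-mark the handful of operations they actually support (loads, stores,
+// bitcasts and lane moves) as Legal or Custom.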
+void ARMTargetLowering::setAllExpand(MVT VT) {
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VT, Expand);
+}
+
+void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
+ const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
+
+ for (auto VT : IntTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ }
+
+ const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
+ for (auto VT : FloatTypes) {
+ addRegisterClass(VT, &ARM::QPRRegClass);
+ if (!HasMVEFP)
+ setAllExpand(VT);
+
+ // These are legal or custom whether we have MVE.fp or not
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
+ setOperationAction(ISD::BITCAST, VT, Legal);
+ setOperationAction(ISD::LOAD, VT, Legal);
+ setOperationAction(ISD::STORE, VT, Legal);
+ }
+
// We 'support' these types up to bitcast/load/store level, regardless of
// MVE integer-only / float support. Only FP data processing on the FP vector
// types is inhibited at the integer-only level.
-
- const MVT VecTypes[] = {
- MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8,
- MVT::v2f64, MVT::v4f32, MVT::v8f16,
- };
-
- for (auto VT : VecTypes) {
+ const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
+ for (auto VT : LongTypes) {
addRegisterClass(VT, &ARM::QPRRegClass);
- for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
- setOperationAction(Opc, VT, Expand);
+ setAllExpand(VT);
setOperationAction(ISD::BITCAST, VT, Legal);
setOperationAction(ISD::LOAD, VT, Legal);
setOperationAction(ISD::STORE, VT, Legal);
setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
if (Subtarget->hasMVEIntegerOps())
- addMVEVectorTypes();
+ addMVEVectorTypes(Subtarget->hasMVEFloatOps());
if (Subtarget->hasNEON()) {
addDRTypeForNEON(MVT::v2f32);
if (SplatUndef.isAllOnesValue())
return DAG.getUNDEF(VT);
- if (SplatBitSize <= 64) {
+ if (ST->hasNEON() && SplatBitSize <= 64) {
// Check if an immediate VMOV works.
EVT VmovVT;
SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
}
if (VT.getVectorElementType().isFloatingPoint()) {
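+ // Retry as an integer BUILD_VECTOR: bitcast each element to the
+ // equivalently sized integer type and lower that vector instead.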
SmallVector<SDValue, 8> Ops;
+ MVT FVT = VT.getVectorElementType().getSimpleVT();
+ assert(FVT == MVT::f32 || FVT == MVT::f16);
+ MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
for (unsigned i = 0; i < NumElts; ++i)
- Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
+ Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
Op.getOperand(i)));
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG, ST);
if (Val.getNode())
return shuffle;
}
- if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
+ if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 &&
+ VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
// into two 64-bit vectors; we might discover a better way to lower it.
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
// Vectors with 32- or 64-bit elements can be built by directly assigning
// the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
// will be legalized.
- if (EltSize >= 32) {
+ if (ST->hasNEON() && EltSize >= 32) {
// Do the expansion with floating-point types, since that is what the VFP
// registers are defined to use, and since i64 is not legal.
EVT EltVT = EVT::getFloatingPointVT(EltSize);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
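+// These opcodes, previously a local enum inside GeneratePerfectShuffle, are
+// hoisted to file scope so that isLegalMVEShuffleOp below can decode
+// PerfectShuffleTable entries as well.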
+enum ShuffleOpCodes {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR // VTRN, right result
+};
+
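+// A PerfectShuffleTable entry encodes its cost in bits 31-30, the shuffle
+// opcode in bits 29-26 and two 13-bit operand-entry indices below that. Of
+// those opcodes, only COPY, VREV and the VDUPs map onto shuffles that are
+// also available under MVE.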
+static bool isLegalMVEShuffleOp(unsigned PFEntry) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ switch (OpNum) {
+ case OP_COPY:
+ case OP_VREV:
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3:
+ return true;
+ }
+ return false;
+}
+
/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
+ if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
return true;
}
unsigned Imm, WhichResult;
unsigned EltSize = VT.getScalarSizeInBits();
- return (EltSize >= 32 ||
- ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
- isVREVMask(M, VT, 64) ||
- isVREVMask(M, VT, 32) ||
- isVREVMask(M, VT, 16) ||
- isVEXTMask(M, VT, ReverseVEXT, Imm) ||
- isVTBLMask(M, VT) ||
- isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) ||
- ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT)));
+ if (EltSize >= 32 ||
+ ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+ isVREVMask(M, VT, 64) ||
+ isVREVMask(M, VT, 32) ||
+ isVREVMask(M, VT, 16))
+ return true;
+ else if (Subtarget->hasNEON() &&
+ (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isVTBLMask(M, VT) ||
+ isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
+ return true;
+ else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ isReverseMask(M, VT))
+ return true;
+ else
+ return false;
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
- enum {
- OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
- OP_VREV,
- OP_VDUP0,
- OP_VDUP1,
- OP_VDUP2,
- OP_VDUP3,
- OP_VEXT1,
- OP_VEXT2,
- OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR // VTRN, right result
- };
-
if (OpNum == OP_COPY) {
if (LHSID == (1*9+2)*9+3) return LHS;
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
DAG.getConstant(ExtractNum, DL, MVT::i32));
}
-static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc dl(Op);
bool ReverseVEXT = false;
unsigned Imm = 0;
- if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+ if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
if (ReverseVEXT)
std::swap(V1, V2);
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
- if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ if (ST->hasNEON() && V2->isUndef() &&
+ isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
// used for both shuffles.
unsigned WhichResult = 0;
bool isV_UNDEF = false;
- if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
- ShuffleMask, VT, WhichResult, isV_UNDEF)) {
- if (isV_UNDEF)
- V2 = V1;
- return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
- .getValue(WhichResult);
+ if (ST->hasNEON()) {
+ if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
+ ShuffleMask, VT, WhichResult, isV_UNDEF)) {
+ if (isV_UNDEF)
+ V2 = V1;
+ return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
+ .getValue(WhichResult);
+ }
}
// Also check for these shuffles through CONCAT_VECTORS: we canonicalize
// ->
// concat(VZIP(v1, v2):0, :1)
//
- if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
+ if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS &&
+ V2->isUndef()) {
SDValue SubV1 = V1->getOperand(0);
SDValue SubV2 = V1->getOperand(1);
EVT SubVT = SubV1.getValueType();
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
- if (Cost <= 4)
- return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ if (Cost <= 4) {
+ if (ST->hasNEON())
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ else if (isLegalMVEShuffleOp(PFEntry)) {
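+ // A perfect-shuffle entry expands recursively via two more table entries;
+ // only take this route for MVE when both of those are also legal
+ // (COPY/VREV/VDUP) shuffles.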
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
+ unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
+ unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
+ if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
+ return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+ }
+ }
}
// Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
+ if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) &&
+ isReverseMask(ShuffleMask, VT))
return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
- if (VT == MVT::v8i8)
+ if (ST->hasNEON() && VT == MVT::v8i8)
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
return NewOp;
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
- case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformVDUPCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
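+ // The VDUP(LOAD) -> VLD1DUP combine below creates NEON-only nodes, so bail
+ // out early if we only have MVE.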
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
// Match VDUP(LOAD) -> VLD1DUP.
// We match this pattern here rather than waiting for isel because the
// transform is only legal for unindexed loads.
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
- case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
+ case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
MachineBasicBlock *MBB) const;
MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- void addMVEVectorTypes();
+ void addMVEVectorTypes(bool HasMVEFP);
+ void setAllExpand(MVT VT);
};
enum NEONModImmType {
def ARMsmlaltb : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
def ARMsmlaltt : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+// Vector operations shared between NEON and MVE
+
+def ARMvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def ARMvduplane : SDNode<"ARMISD::VDUPLANE",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>>;
+
+def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisVT<2, i32>]>;
+def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
def MVE_VREV16_8 : MVE_VREV<"vrev16", "8", 0b00, 0b10>;
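+// Selection patterns mapping the shared ARMvrev* nodes onto the MVE VREV
+// instructions, for both integer and floating-point vector types.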
+let Predicates = [HasMVEInt] in {
+def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
+ (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
+def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>;
+def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>;
+
+def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))),
+ (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>;
+def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>;
+
+def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))),
+ (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>;
+
+def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))),
+ (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>;
+def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))),
+ (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>;
+def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))),
+ (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>;
+}
+
def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
"vmvn", "", "$Qd, $Qm", ""> {
let Inst{28} = 0b1;
def MVE_VMOV_from_lane_u8 : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
def MVE_VMOV_to_lane_8 : MVE_VMOV_lane_8 < "8", 0b0, MVE_VMOV_to_lane>;
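+// Lane insert/extract and scalar_to_vector selection for MVE, using the VMOV
+// lane-move instructions. 32-bit lane extracts can instead copy the matching
+// S sub-register directly.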
+let Predicates = [HasMVEInt] in {
+ def : Pat<(extractelt (v4i32 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), rGPR)>;
+ def : Pat<(insertelt (v4i32 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_32 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+ def : Pat<(vector_insert (v16i8 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_8 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+ def : Pat<(vector_insert (v8i16 MQPR:$src1), rGPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, rGPR:$src2, imm:$lane)>;
+
+ def : Pat<(ARMvgetlanes (v16i8 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+
+ def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_16 (v8i16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_32 (v4i32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+
+ // Floating point patterns, still enabled under HasMVEInt
+ def : Pat<(extractelt (v4f32 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f32_reg imm:$lane))), SPR)>;
+ def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
+ (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
+
+ def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
+ def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
+
+ def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
+ def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+ def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+ def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
+ (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+}
+
// end of mve_bit instructions
// start of MVE Integer instructions
def MVE_VDUP16 : MVE_VDUP<"16", 0b0, 0b1>;
def MVE_VDUP8 : MVE_VDUP<"8", 0b1, 0b0>;
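+// VDUP and VDUPLANE selection for MVE. Lane duplicates are lowered as a
+// VMOV from the source lane to a GPR followed by a VDUP of that register.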
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v16i8 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP8 rGPR:$elem)>;
+ def : Pat<(v8i16 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP16 rGPR:$elem)>;
+ def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP32 rGPR:$elem)>;
+
+ def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
+ (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+ // For the 16-bit and 8-bit vduplanes we don't care about the signedness
+ // of the lane move operation as we only want the lowest 8/16 bits anyway.
+ def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
+ (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+ def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
+ (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
+
+ def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
+ (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
+ def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
+ (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+
+ def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
+ (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
+ def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
+ (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+}
+
class MVEIntSingleSrc<string iname, string suffix, bits<2> size,
list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins MQPR:$Qm), NoItinerary,
def NEONvsli : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
def NEONvsri : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
-def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
- SDTCisVT<2, i32>]>;
-def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
-def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
-
def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>]>>;
-def NEONvdup : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
-
-// VDUPLANE can produce a quad-register result from a double-register source,
-// so the result is not constrained to match the source.
-def NEONvduplane : SDNode<"ARMISD::VDUPLANE",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisVT<2, i32>]>>;
-
def SDTARMVEXT : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
def NEONvext : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
-def SDTARMVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
-def NEONvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
-def NEONvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
-def NEONvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
-
def SDTARMVSHUF2 : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
SDTCisSameAs<0, 3>]>;
(ins AddrMode:$Rn),
IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
[(set VecListOneDAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
+ (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]>,
Sched<[WriteVLD2]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
addrmode6dupalign32>;
let Predicates = [HasNEON] in {
-def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+def : Pat<(v2f32 (ARMvdup (f32 (load addrmode6dup:$addr)))),
(VLD1DUPd32 addrmode6:$addr)>;
}
(ins AddrMode:$Rn), IIC_VLD1dup,
"vld1", Dt, "$Vd, $Rn", "",
[(set VecListDPairAllLanes:$Vd,
- (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
+ (Ty (ARMvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
let DecoderMethod = "DecodeVLD1DupInstruction";
addrmode6dupalign32>;
let Predicates = [HasNEON] in {
-def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
+def : Pat<(v4f32 (ARMvdup (f32 (load addrmode6dup:$addr)))),
(VLD1DUPq32 addrmode6:$addr)>;
}
}
def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-5} = lane{2-0};
}
def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
let Inst{4} = Rn{4};
}
let Inst{5-4} = Rn{5-4};
}
-def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, NEONvgetlaneu>;
-def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, NEONvgetlaneu>;
+def VST1LNq8Pseudo : VST1QLNPseudo<v16i8, truncstorei8, ARMvgetlaneu>;
+def VST1LNq16Pseudo : VST1QLNPseudo<v8i16, truncstorei16, ARMvgetlaneu>;
def VST1LNq32Pseudo : VST1QLNPseudo<v4i32, store, extractelt>;
let Predicates = [HasNEON] in {
}
def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-5} = lane{2-0};
}
def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
- NEONvgetlaneu, addrmode6> {
+ ARMvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
let Inst{4} = Rn{4};
}
let Inst{5-4} = Rn{5-4};
}
-def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, NEONvgetlaneu>;
-def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16,NEONvgetlaneu>;
+def VST1LNq8Pseudo_UPD : VST1QLNWBPseudo<v16i8, post_truncsti8, ARMvgetlaneu>;
+def VST1LNq16Pseudo_UPD : VST1QLNWBPseudo<v8i16, post_truncsti16, ARMvgetlaneu>;
def VST1LNq32Pseudo_UPD : VST1QLNWBPseudo<v4i32, post_store, extractelt>;
let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in {
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
NVMulSLFrm, IIC_VMULi16D, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane","",
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
let isCommutable = 0;
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
NVMulSLFrm, IIC_VMULi16Q, OpcodeStr, Dt,"$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]> {
// All of these have a two-operand InstAlias.
let TwoOperandAliasConstraint = "$Vn = $Vd";
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (IntOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (Ty DPR:$Vd),
(Ty (IntOp (Ty DPR:$Vn),
- (Ty (NEONvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
+ (Ty (ARMvduplane (Ty DPR_8:$Vm), imm:$lane)))))]> {
let isCommutable = 0;
}
class N3VDIntSh<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$Vn),
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]> {
let isCommutable = 0;
}
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$Vn,
- (Ty (NEONvduplane (Ty DPR_VFP2:$Vm),
+ (Ty (ARMvduplane (Ty DPR_VFP2:$Vm),
imm:$lane)))))))]>;
class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
[(set (Ty DPR:$Vd),
(Ty (ShOp (Ty DPR:$src1),
(Ty (MulOp DPR:$Vn,
- (Ty (NEONvduplane (Ty DPR_8:$Vm),
+ (Ty (ARMvduplane (Ty DPR_8:$Vm),
imm:$lane)))))))]>;
class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$Vn,
- (ResTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))))]>;
class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
string OpcodeStr, string Dt,
[(set (ResTy QPR:$Vd),
(ResTy (ShOp (ResTy QPR:$src1),
(ResTy (MulOp QPR:$Vn,
- (ResTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (ResTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))))]>;
// Neon Intrinsic-Op instructions (VABA): double- and quad-register.
[(set QPR:$Vd,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),
+ (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),
imm:$lane))))))]>;
class N3VLMulOpSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
[(set QPR:$Vd,
(OpNode (TyQ QPR:$src1),
(TyQ (MulOp (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_8:$Vm),
+ (TyD (ARMvduplane (TyD DPR_8:$Vm),
imm:$lane))))))]>;
// Long Intrinsic-Op vector operations with explicit extend (VABAL).
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]>;
class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (ResTy QPR:$src1),
(OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]>;
// Narrowing 3-register intrinsics.
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set QPR:$Vd,
(TyQ (OpNode (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
+ (TyD (ARMvduplane (TyD DPR_VFP2:$Vm),imm:$lane)))))]>;
class N3VLSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
ValueType TyQ, ValueType TyD, SDNode OpNode>
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set QPR:$Vd,
(TyQ (OpNode (TyD DPR:$Vn),
- (TyD (NEONvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
+ (TyD (ARMvduplane (TyD DPR_8:$Vm), imm:$lane)))))]>;
// Long 3-register operations with explicitly extended operands.
class N3VLExt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_VFP2:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_VFP2:$Vm),
imm:$lane)))))]>;
class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8,
InstrItinClass itin, string OpcodeStr, string Dt,
NVMulSLFrm, itin, OpcodeStr, Dt, "$Vd, $Vn, $Vm$lane", "",
[(set (ResTy QPR:$Vd),
(ResTy (IntOp (OpTy DPR:$Vn),
- (OpTy (NEONvduplane (OpTy DPR_8:$Vm),
+ (OpTy (ARMvduplane (OpTy DPR_8:$Vm),
imm:$lane)))))]>;
// Wide 3-register operations.
let Predicates = [HasNEON] in {
def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2), imm:$lane)))),
(v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2), imm:$lane)))),
(v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
- (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src2), imm:$lane)))),
(v4f32 (VMULslfq (v4f32 QPR:$src1),
(v2f32 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
def : Pat<(v8f16 (fmul (v8f16 QPR:$src1),
- (v8f16 (NEONvduplane (v8f16 QPR:$src2), imm:$lane)))),
+ (v8f16 (ARMvduplane (v8f16 QPR:$src2), imm:$lane)))),
(v8f16 (VMULslhq(v8f16 QPR:$src1),
(v4f16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfd DPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v4f16 (fmul DPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhd DPR:$Rn,
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,
(INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
(i32 0))>;
-def : Pat<(v8f16 (fmul QPR:$Rn, (NEONvdup (f16 HPR:$Rm)))),
+def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhq QPR:$Rn,
(INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
(i32 0))>;
let Predicates = [HasNEON] in {
def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2),
imm:$lane)))),
(v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2),
imm:$lane)))),
(v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
let Predicates = [HasNEON] in {
def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
- (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src2),
imm:$lane)))),
(v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
(v4i16 (EXTRACT_SUBREG QPR:$src2,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
- (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src2),
imm:$lane)))),
(v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
(v2i32 (EXTRACT_SUBREG QPR:$src2,
let Predicates = [HasNEON] in {
def : Pat<(v8i16 (add (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLAslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
def : Pat<(v4i32 (add (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLAslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
(fmul_su (v4f32 QPR:$src2),
- (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLAslfq (v4f32 QPR:$src1),
(v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3),
imm:$lane)))))),
(v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3),
imm:$lane)))))),
(v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v4i16 DPR:$src1),
(v4i16 (int_arm_neon_vqrdmulh
(v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
def : Pat<(v2i32 (int_arm_neon_vqsubs
(v2i32 DPR:$src1),
(v2i32 (int_arm_neon_vqrdmulh
(v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
(v8i16 QPR:$src1),
(v8i16 (int_arm_neon_vqrdmulh
(v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3),
imm:$lane)))))),
(v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqrdmulh
(v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3),
imm:$lane)))))),
(v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
}
let Predicates = [HasNEON] in {
def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
(mul (v8i16 QPR:$src2),
- (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+ (v8i16 (ARMvduplane (v8i16 QPR:$src3), imm:$lane))))),
(v8i16 (VMLSslv8i16 (v8i16 QPR:$src1), (v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
(mul (v4i32 QPR:$src2),
- (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+ (v4i32 (ARMvduplane (v4i32 QPR:$src3), imm:$lane))))),
(v4i32 (VMLSslv4i32 (v4i32 QPR:$src1), (v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
(fmul_su (v4f32 QPR:$src2),
- (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+ (v4f32 (ARMvduplane (v4f32 QPR:$src3), imm:$lane))))),
(v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
(v2f32 (EXTRACT_SUBREG QPR:$src3,
(DSubReg_i32_reg imm:$lane))),
(VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
(v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
- (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+ (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
imm:$lane)))))),
(VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
(v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
- (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+ (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
imm:$lane)))))),
(VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
}
(AccumType (OpNode (AccumType Ty:$Vd),
(InputType Ty:$Vn),
(InputType (bitconvert (AccumType
- (NEONvduplane (AccumType Ty:$Vm),
+ (ARMvduplane (AccumType Ty:$Vm),
VectorIndex32:$lane)))))),
(!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
}
def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
(outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
IIC_VMOVSI, "vmov", "s8", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlanes (v8i8 DPR:$V),
+ [(set GPR:$R, (ARMvgetlanes (v8i8 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{2};
let Inst{6-5} = lane{1-0};
def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
(outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
IIC_VMOVSI, "vmov", "s16", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlanes (v4i16 DPR:$V),
+ [(set GPR:$R, (ARMvgetlanes (v4i16 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{1};
let Inst{6} = lane{0};
def VGETLNu8 : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
(outs GPR:$R), (ins DPR:$V, VectorIndex8:$lane),
IIC_VMOVSI, "vmov", "u8", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlaneu (v8i8 DPR:$V),
+ [(set GPR:$R, (ARMvgetlaneu (v8i8 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{2};
let Inst{6-5} = lane{1-0};
def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
(outs GPR:$R), (ins DPR:$V, VectorIndex16:$lane),
IIC_VMOVSI, "vmov", "u16", "$R, $V$lane",
- [(set GPR:$R, (NEONvgetlaneu (v4i16 DPR:$V),
+ [(set GPR:$R, (ARMvgetlaneu (v4i16 DPR:$V),
imm:$lane))]> {
let Inst{21} = lane{1};
let Inst{6} = lane{0};
}
let Predicates = [HasNEON] in {
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
-def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlanes (v16i8 QPR:$src), imm:$lane),
(VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
-def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlanes (v8i16 QPR:$src), imm:$lane),
(VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
-def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlaneu (v16i8 QPR:$src), imm:$lane),
(VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane))>;
-def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+def : Pat<(ARMvgetlaneu (v8i16 QPR:$src), imm:$lane),
(VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane))>;
class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs DPR:$V), (ins GPR:$R),
IIC_VMOVIS, "vdup", Dt, "$V, $R",
- [(set DPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+ [(set DPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>;
class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
: NVDup<opcod1, 0b1011, opcod3, (outs QPR:$V), (ins GPR:$R),
IIC_VMOVIS, "vdup", Dt, "$V, $R",
- [(set QPR:$V, (Ty (NEONvdup (i32 GPR:$R))))]>;
+ [(set QPR:$V, (Ty (ARMvdup (i32 GPR:$R))))]>;
def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>;
def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>;
-// NEONvdup patterns for uarchs with fast VDUP.32.
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+// ARMvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
Requires<[HasNEON,HasFastVDUP32]>;
-def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>,
+def : Pat<(v4f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>,
Requires<[HasNEON]>;
-// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
-def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+// ARMvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (ARMvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
Requires<[HasNEON,HasSlowVDUP32]>;
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+def : Pat<(v2f32 (ARMvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
Requires<[HasNEON,HasSlowVDUP32]>;
// VDUP : Vector Duplicate Lane (from scalar to all elements)
ValueType Ty, Operand IdxTy>
: NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
- [(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
+ [(set DPR:$Vd, (Ty (ARMvduplane (Ty DPR:$Vm), imm:$lane)))]>;
class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
ValueType ResTy, ValueType OpTy, Operand IdxTy>
: NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
- [(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
+ [(set QPR:$Vd, (ResTy (ARMvduplane (OpTy DPR:$Vm),
VectorIndex32:$lane)))]>;
// Inst{19-16} is partially specified depending on the element size.
}
let Predicates = [HasNEON] in {
-def : Pat<(v4f16 (NEONvduplane (v4f16 DPR:$Vm), imm:$lane)),
+def : Pat<(v4f16 (ARMvduplane (v4f16 DPR:$Vm), imm:$lane)),
(VDUPLN32d DPR:$Vm, imm:$lane)>;
-def : Pat<(v2f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+def : Pat<(v2f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)),
(VDUPLN32d DPR:$Vm, imm:$lane)>;
-def : Pat<(v4f32 (NEONvduplane (v2f32 DPR:$Vm), imm:$lane)),
+def : Pat<(v4f32 (ARMvduplane (v2f32 DPR:$Vm), imm:$lane)),
(VDUPLN32q DPR:$Vm, imm:$lane)>;
-def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+def : Pat<(v16i8 (ARMvduplane (v16i8 QPR:$src), imm:$lane)),
(v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i8_reg imm:$lane))),
(SubReg_i8_lane imm:$lane)))>;
-def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+def : Pat<(v8i16 (ARMvduplane (v8i16 QPR:$src), imm:$lane)),
(v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v8f16 (NEONvduplane (v8f16 QPR:$src), imm:$lane)),
+def : Pat<(v8f16 (ARMvduplane (v8f16 QPR:$src), imm:$lane)),
(v8f16 (VDUPLN16q (v4f16 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
-def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+def : Pat<(v4i32 (ARMvduplane (v4i32 QPR:$src), imm:$lane)),
(v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)),
(v4f32 (VDUPLN32q (v2f32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f16 (NEONvdup HPR:$src)),
+def : Pat<(v4f16 (ARMvdup HPR:$src)),
(v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
HPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v2f32 (NEONvdup (f32 SPR:$src))),
+def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))),
(v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v4f32 (NEONvdup (f32 SPR:$src))),
+def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))),
(v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v8f16 (NEONvdup HPR:$src)),
+def : Pat<(v8f16 (ARMvdup HPR:$src)),
(v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
HPR:$src, ssub_0), (i32 0)))>;
}
: N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev64 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev64 (Ty DPR:$Vm))))]>;
class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev64 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev64 (Ty QPR:$Vm))))]>;
def VREV64d8 : VREV64D<0b00, "vrev64", "8", v8i8>;
def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
let Predicates = [HasNEON] in {
-def : Pat<(v2f32 (NEONvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
+def : Pat<(v2f32 (ARMvrev64 (v2f32 DPR:$Vm))), (VREV64d32 DPR:$Vm)>;
}
def VREV64q8 : VREV64Q<0b00, "vrev64", "8", v16i8>;
def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
let Predicates = [HasNEON] in {
-def : Pat<(v4f32 (NEONvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
-def : Pat<(v8f16 (NEONvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
-def : Pat<(v4f16 (NEONvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
+def : Pat<(v4f32 (ARMvrev64 (v4f32 QPR:$Vm))), (VREV64q32 QPR:$Vm)>;
+def : Pat<(v8f16 (ARMvrev64 (v8f16 QPR:$Vm))), (VREV64q16 QPR:$Vm)>;
+def : Pat<(v4f16 (ARMvrev64 (v4f16 DPR:$Vm))), (VREV64d16 DPR:$Vm)>;
}
// VREV32 : Vector Reverse elements within 32-bit words
: N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev32 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev32 (Ty DPR:$Vm))))]>;
class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev32 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev32 (Ty QPR:$Vm))))]>;
def VREV32d8 : VREV32D<0b00, "vrev32", "8", v8i8>;
def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
: N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$Vd),
(ins DPR:$Vm), IIC_VMOVD,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set DPR:$Vd, (Ty (NEONvrev16 (Ty DPR:$Vm))))]>;
+ [(set DPR:$Vd, (Ty (ARMvrev16 (Ty DPR:$Vm))))]>;
class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
: N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$Vd),
(ins QPR:$Vm), IIC_VMOVQ,
OpcodeStr, Dt, "$Vd, $Vm", "",
- [(set QPR:$Vd, (Ty (NEONvrev16 (Ty QPR:$Vm))))]>;
+ [(set QPR:$Vd, (Ty (ARMvrev16 (Ty QPR:$Vm))))]>;
def VREV16d8 : VREV16D<0b00, "vrev16", "8", v8i8>;
def VREV16q8 : VREV16Q<0b00, "vrev16", "8", v16i8>;
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i64(<2 x i64> %src) {
; CHECK-LABEL: bitcast_i64_i64:
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
+; CHECK-LABEL: shuffle1_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle2_i32(<4 x i32> %src) {
+; CHECK-LABEL: shuffle2_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
+; CHECK-LABEL: shuffle3_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle5_i32(<4 x i32> %src) {
+; CHECK-LABEL: shuffle5_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev64.32 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <4 x i32> @shuffle6_i32(<4 x i32> %src) {
+; CHECK-LABEL: shuffle6_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 3>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
+; CHECK-LABEL: shuffle1_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.u16 r0, q0[7]
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vmov.u16 r0, q1[6]
+; CHECK-NEXT: vmov.16 q0[1], r0
+; CHECK-NEXT: vmov.u16 r0, q1[5]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: vmov.u16 r0, q1[4]
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vmov.u16 r0, q1[3]
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.u16 r0, q1[2]
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: vmov.u16 r0, q1[0]
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle2_i16(<8 x i16> %src) {
+; CHECK-LABEL: shuffle2_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
+; CHECK-LABEL: shuffle3_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.u16 r0, q0[4]
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vmov.u16 r0, q1[5]
+; CHECK-NEXT: vmov.16 q0[1], r0
+; CHECK-NEXT: vmov.u16 r0, q1[7]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: vmov.u16 r0, q1[6]
+; CHECK-NEXT: vmov.16 q0[3], r0
+; CHECK-NEXT: vmov.u16 r0, q1[3]
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.u16 r0, q1[1]
+; CHECK-NEXT: vmov.16 q0[5], r0
+; CHECK-NEXT: vmov.u16 r0, q1[2]
+; CHECK-NEXT: vmov.16 q0[6], r0
+; CHECK-NEXT: vmov.u16 r0, q1[0]
+; CHECK-NEXT: vmov.16 q0[7], r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle5_i16(<8 x i16> %src) {
+; CHECK-LABEL: shuffle5_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev64.16 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @shuffle6_i16(<8 x i16> %src) {
+; CHECK-LABEL: shuffle6_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev32.16 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle1_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle1_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.u8 r0, q0[15]
+; CHECK-NEXT: vmov.8 q0[0], r0
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.8 q0[1], r0
+; CHECK-NEXT: vmov.u8 r0, q1[13]
+; CHECK-NEXT: vmov.8 q0[2], r0
+; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.8 q0[3], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q0[4], r0
+; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.8 q0[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.8 q0[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.8 q0[7], r0
+; CHECK-NEXT: vmov.u8 r0, q1[7]
+; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.u8 r0, q1[4]
+; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[1]
+; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle2_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle2_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle3_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vmov.u8 r0, q0[4]
+; CHECK-NEXT: vmov.8 q0[0], r0
+; CHECK-NEXT: vmov.u8 r0, q1[5]
+; CHECK-NEXT: vmov.8 q0[1], r0
+; CHECK-NEXT: vmov.u8 r0, q1[15]
+; CHECK-NEXT: vmov.8 q0[2], r0
+; CHECK-NEXT: vmov.u8 r0, q1[7]
+; CHECK-NEXT: vmov.8 q0[3], r0
+; CHECK-NEXT: vmov.u8 r0, q1[14]
+; CHECK-NEXT: vmov.8 q0[4], r0
+; CHECK-NEXT: vmov.u8 r0, q1[9]
+; CHECK-NEXT: vmov.8 q0[5], r0
+; CHECK-NEXT: vmov.u8 r0, q1[6]
+; CHECK-NEXT: vmov.8 q0[6], r0
+; CHECK-NEXT: vmov.u8 r0, q1[3]
+; CHECK-NEXT: vmov.8 q0[7], r0
+; CHECK-NEXT: vmov.u8 r0, q1[10]
+; CHECK-NEXT: vmov.8 q0[8], r0
+; CHECK-NEXT: vmov.u8 r0, q1[12]
+; CHECK-NEXT: vmov.8 q0[9], r0
+; CHECK-NEXT: vmov.u8 r0, q1[1]
+; CHECK-NEXT: vmov.8 q0[10], r0
+; CHECK-NEXT: vmov.u8 r0, q1[13]
+; CHECK-NEXT: vmov.8 q0[11], r0
+; CHECK-NEXT: vmov.u8 r0, q1[2]
+; CHECK-NEXT: vmov.8 q0[12], r0
+; CHECK-NEXT: vmov.u8 r0, q1[8]
+; CHECK-NEXT: vmov.8 q0[13], r0
+; CHECK-NEXT: vmov.u8 r0, q1[0]
+; CHECK-NEXT: vmov.8 q0[14], r0
+; CHECK-NEXT: vmov.u8 r0, q1[11]
+; CHECK-NEXT: vmov.8 q0[15], r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 15, i32 7, i32 14, i32 9, i32 6, i32 3, i32 10, i32 12, i32 1, i32 13, i32 2, i32 8, i32 0, i32 11>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle5_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle5_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev64.8 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle6_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle6_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev32.8 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @shuffle7_i8(<16 x i8> %src) {
+; CHECK-LABEL: shuffle7_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev16.8 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
+; CHECK-LABEL: shuffle1_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @shuffle2_f32(<4 x float> %src) {
+; CHECK-LABEL: shuffle2_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
+; CHECK-LABEL: shuffle3_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @shuffle5_f32(<4 x float> %src) {
+; CHECK-LABEL: shuffle5_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev64.32 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
+; CHECK-MVE-LABEL: shuffle1_f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
+; CHECK-MVE-NEXT: vdup.16 q1, r2
+; CHECK-MVE-NEXT: vmov.u16 r1, q0[6]
+; CHECK-MVE-NEXT: vmov.16 q1[0], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
+; CHECK-MVE-NEXT: vmov.16 q1[1], r1
+; CHECK-MVE-NEXT: vmov.16 q1[2], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
+; CHECK-MVE-NEXT: vmov.16 q1[3], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
+; CHECK-MVE-NEXT: vmov.16 q1[4], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
+; CHECK-MVE-NEXT: vmov.16 q1[5], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT: vmov.16 q1[6], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
+; CHECK-MVE-NEXT: vmov.16 q1[7], r0
+; CHECK-MVE-NEXT: vmov q0, q1
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle1_f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[7]
+; CHECK-MVEFP-NEXT: vmov.u16 r1, q0[6]
+; CHECK-MVEFP-NEXT: vmov.16 q1[0], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[5]
+; CHECK-MVEFP-NEXT: vmov.16 q1[1], r1
+; CHECK-MVEFP-NEXT: vmov.16 q1[2], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[4]
+; CHECK-MVEFP-NEXT: vmov.16 q1[3], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[3]
+; CHECK-MVEFP-NEXT: vmov.16 q1[4], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[2]
+; CHECK-MVEFP-NEXT: vmov.16 q1[5], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[1]
+; CHECK-MVEFP-NEXT: vmov.16 q1[6], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[0]
+; CHECK-MVEFP-NEXT: vmov.16 q1[7], r0
+; CHECK-MVEFP-NEXT: vmov q0, q1
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle2_f16(<8 x half> %src) {
+; CHECK-LABEL: shuffle2_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
+; CHECK-MVE-LABEL: shuffle3_f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
+; CHECK-MVE-NEXT: vdup.16 q1, r2
+; CHECK-MVE-NEXT: vmov.u16 r1, q0[5]
+; CHECK-MVE-NEXT: vmov.16 q1[0], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
+; CHECK-MVE-NEXT: vmov.16 q1[1], r1
+; CHECK-MVE-NEXT: vmov.16 q1[2], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
+; CHECK-MVE-NEXT: vmov.16 q1[3], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
+; CHECK-MVE-NEXT: vmov.16 q1[4], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT: vmov.16 q1[5], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
+; CHECK-MVE-NEXT: vmov.16 q1[6], r0
+; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
+; CHECK-MVE-NEXT: vmov.16 q1[7], r0
+; CHECK-MVE-NEXT: vmov q0, q1
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: shuffle3_f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[4]
+; CHECK-MVEFP-NEXT: vmov.u16 r1, q0[5]
+; CHECK-MVEFP-NEXT: vmov.16 q1[0], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[7]
+; CHECK-MVEFP-NEXT: vmov.16 q1[1], r1
+; CHECK-MVEFP-NEXT: vmov.16 q1[2], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[6]
+; CHECK-MVEFP-NEXT: vmov.16 q1[3], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[3]
+; CHECK-MVEFP-NEXT: vmov.16 q1[4], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[1]
+; CHECK-MVEFP-NEXT: vmov.16 q1[5], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[2]
+; CHECK-MVEFP-NEXT: vmov.16 q1[6], r0
+; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[0]
+; CHECK-MVEFP-NEXT: vmov.16 q1[7], r0
+; CHECK-MVEFP-NEXT: vmov q0, q1
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
+ ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle5_f16(<8 x half> %src) {
+; CHECK-LABEL: shuffle5_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev64.16 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x half> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @shuffle6_f16(<8 x half> %src) {
+; CHECK-LABEL: shuffle6_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vrev32.16 q0, q0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x half> %out
+}
+
+
+define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
+; CHECK-LABEL: insert_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.32 q0[0], r0
+; CHECK-NEXT: bx lr
+entry:
+ %res = insertelement <4 x i32> undef, i32 %a, i32 0
+ ret <4 x i32> %res
+}
+
+define arm_aapcs_vfpcc <8 x i16> @insert_i16(i16 %a) {
+; CHECK-LABEL: insert_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: bx lr
+entry:
+ %res = insertelement <8 x i16> undef, i16 %a, i32 0
+ ret <8 x i16> %res
+}
+
+define arm_aapcs_vfpcc <16 x i8> @insert_i8(i8 %a) {
+; CHECK-LABEL: insert_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.8 q0[0], r0
+; CHECK-NEXT: bx lr
+entry:
+ %res = insertelement <16 x i8> undef, i8 %a, i32 0
+ ret <16 x i8> %res
+}
+
+define arm_aapcs_vfpcc <4 x float> @insert_f32(float %a) {
+; CHECK-LABEL: insert_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: bx lr
+entry:
+ %res = insertelement <4 x float> undef, float %a, i32 0
+ ret <4 x float> %res
+}
+
+; TODO: Calling convention needs fixing to pass half types directly to functions
+define arm_aapcs_vfpcc <8 x half> @insert_f16(half *%aa) {
+; CHECK-LABEL: insert_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %a = load half, half* %aa
+ %res = insertelement <8 x half> undef, half %a, i32 0
+ ret <8 x half> %res
+}
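+
+; A hypothetical direct-argument form of insert_f16 (an illustrative sketch only;
+; @insert_f16_direct is not a test that is run here, because, per the TODO above,
+; the calling convention cannot yet pass half types directly):
+;   define arm_aapcs_vfpcc <8 x half> @insert_f16_direct(half %a) {
+;   entry:
+;     %res = insertelement <8 x half> undef, half %a, i32 0
+;     ret <8 x half> %res
+;   }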
+
+define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
+; CHECK-LABEL: scalar_to_vector_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, #8
+; CHECK-NEXT: movs r0, #7
+; CHECK-NEXT: movs r1, #1
+; CHECK-NEXT: strh.w r0, [sp, #2]
+; CHECK-NEXT: vmov.u16 r0, q0[0]
+; CHECK-NEXT: strh.w r0, [sp]
+; CHECK-NEXT: movt r1, #9
+; CHECK-NEXT: ldr r0, [sp]
+; CHECK-NEXT: add sp, #8
+; CHECK-NEXT: bx lr
+entry:
+ %f = shufflevector <8 x i16> %v, <8 x i16> <i16 undef, i16 7, i16 1, i16 9, i16 undef, i16 undef, i16 undef, i16 undef>, <4 x i32> <i32 0, i32 9, i32 10, i32 11>
+ %0 = bitcast <4 x i16> %f to i64
+ ret i64 %0
+}
+
+
+define arm_aapcs_vfpcc i32 @extract_i32_0(<4 x i32> %a) {
+; CHECK-LABEL: extract_i32_0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <4 x i32> %a, i32 0
+ ret i32 %res
+}
+
+define arm_aapcs_vfpcc i32 @extract_i32_3(<4 x i32> %a) {
+; CHECK-LABEL: extract_i32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s3
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <4 x i32> %a, i32 3
+ ret i32 %res
+}
+
+define arm_aapcs_vfpcc i16 @extract_i16_0(<8 x i16> %a) {
+; CHECK-LABEL: extract_i16_0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r0, q0[0]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <8 x i16> %a, i32 0
+ ret i16 %res
+}
+
+define arm_aapcs_vfpcc i16 @extract_i16_3(<8 x i16> %a) {
+; CHECK-LABEL: extract_i16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <8 x i16> %a, i32 3
+ ret i16 %res
+}
+
+define arm_aapcs_vfpcc i8 @extract_i8_0(<16 x i8> %a) {
+; CHECK-LABEL: extract_i8_0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[0]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <16 x i8> %a, i32 0
+ ret i8 %res
+}
+
+define arm_aapcs_vfpcc i8 @extract_i8_3(<16 x i8> %a) {
+; CHECK-LABEL: extract_i8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <16 x i8> %a, i32 3
+ ret i8 %res
+}
+
+define arm_aapcs_vfpcc float @extract_f32_0(<4 x float> %a) {
+; CHECK-LABEL: extract_f32_0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: @ kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <4 x float> %a, i32 0
+ ret float %res
+}
+
+define arm_aapcs_vfpcc float @extract_f32_3(<4 x float> %a) {
+; CHECK-LABEL: extract_f32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s0, s3
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <4 x float> %a, i32 3
+ ret float %res
+}
+
+define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
+; CHECK-LABEL: extract_f16_0:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r1, q0[0]
+; CHECK-NEXT: vmov s0, r1
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <8 x half> %a, i32 0
+ ret half %res
+}
+
+define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
+; CHECK-LABEL: extract_f16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r1, q0[3]
+; CHECK-NEXT: vmov s0, r1
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %res = extractelement <8 x half> %a, i32 3
+ ret half %res
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @vdup_i32(i32 %src) {
+; CHECK-LABEL: vdup_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = insertelement <4 x i32> undef, i32 %src, i32 0
+ %out = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vdup_i16(i16 %src) {
+; CHECK-LABEL: vdup_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = insertelement <8 x i16> undef, i16 %src, i32 0
+ %out = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vdup_i8(i8 %src) {
+; CHECK-LABEL: vdup_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vdup.8 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = insertelement <16 x i8> undef, i8 %src, i32 0
+ %out = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_1(float %src) {
+; CHECK-LABEL: vdup_f32_1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = insertelement <4 x float> undef, float %src, i32 0
+ %out = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vdup_f32_2(float %src1, float %src2) {
+; CHECK-LABEL: vdup_f32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vadd.f32 s0, s0, s1
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = fadd float %src1, %src2
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %out = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %out
+}
+
+; TODO: Calling convention needs fixing to pass half types directly to functions
+define arm_aapcs_vfpcc <8 x half> @vdup_f16(half* %src1, half* %src2) {
+; CHECK-LABEL: vdup_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldr.16 s0, [r1]
+; CHECK-NEXT: vldr.16 s2, [r0]
+; CHECK-NEXT: vadd.f16 s0, s2, s0
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load half, half *%src1, align 2
+ %1 = load half, half *%src2, align 2
+ %2 = fadd half %0, %1
+ %3 = insertelement <8 x half> undef, half %2, i32 0
+ %out = shufflevector <8 x half> %3, <8 x half> undef, <8 x i32> zeroinitializer
+ ret <8 x half> %out
+}
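+
+; A hypothetical direct-argument form of vdup_f16 (a sketch only, pending the
+; same calling-convention fix noted in the TODO above; @vdup_f16_direct is an
+; illustrative name, not a test that is checked here):
+;   define arm_aapcs_vfpcc <8 x half> @vdup_f16_direct(half %src1, half %src2) {
+;   entry:
+;     %0 = fadd half %src1, %src2
+;     %1 = insertelement <8 x half> undef, half %0, i32 0
+;     %out = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
+;     ret <8 x half> %out
+;   }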
+
+
+define arm_aapcs_vfpcc <4 x i32> @vduplane_i32(<4 x i32> %src) {
+; CHECK-LABEL: vduplane_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.32 r0, q0[3]
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i32> %src, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %out
+}
+
+define arm_aapcs_vfpcc <8 x i16> @vduplane_i16(<8 x i16> %src) {
+; CHECK-LABEL: vduplane_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <8 x i16> %out
+}
+
+define arm_aapcs_vfpcc <16 x i8> @vduplane_i8(<16 x i8> %src) {
+; CHECK-LABEL: vduplane_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u8 r0, q0[3]
+; CHECK-NEXT: vdup.8 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <16 x i8> %out
+}
+
+define arm_aapcs_vfpcc <4 x float> @vduplane_f32(<4 x float> %src) {
+; CHECK-LABEL: vduplane_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.32 r0, q0[3]
+; CHECK-NEXT: vdup.32 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x float> %src, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x float> %out
+}
+
+define arm_aapcs_vfpcc <8 x half> @vduplane_f16(<8 x half> %src) {
+; CHECK-LABEL: vduplane_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.u16 r0, q0[3]
+; CHECK-NEXT: vdup.16 q0, r0
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <8 x half> %out
+}