setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
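// E.g. a v2i32 load is widened and then emitted as a single scalar load:
// i64 on 64-bit targets, f64 on 32-bit targets (see the ISD::LOAD handling
// in ReplaceNodeResults below), instead of two separate i32 loads.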
- setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
- setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
TargetLowering::TypeWidenVector)
return SDValue();
- // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
- // and store it.
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
- MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
- MVT CastVT = MVT::getVectorVT(StVT, 2);
- StoredVal = DAG.getBitcast(CastVT, StoredVal);
- StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
- DAG.getIntPtrConstant(0, dl));
- return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
+ if (Subtarget.hasSSE2()) {
+ // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+ // and store it.
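+ // Schematically, for a v2f32 store:
+ //   t0: v4f32 = concat_vectors StoredVal, undef
+ //   t1: v2f64 = bitcast t0
+ //   t2: f64 = extract_vector_elt t1, 0
+ //   store t2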
+ MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+ MVT CastVT = MVT::getVectorVT(StVT, 2);
+ StoredVal = DAG.getBitcast(CastVT, StoredVal);
+ StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+ DAG.getIntPtrConstant(0, dl));
+
+ return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+ St->getPointerInfo(), St->getAlignment(),
+ St->getMemOperand()->getFlags());
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
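+ // With SSE1 only, emit the 64-bit store as a memory intrinsic; it is
+ // selected through the X86vextractstore64 pattern (e.g. MOVLPSmr for
+ // v4f32).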
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+ St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
- MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
- SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(),
- Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue Chain = Res.getValue(1);
- MVT WideVT = MVT::getVectorVT(LdVT, 2);
- Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
- MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
- VT.getVectorNumElements() * 2);
- Res = DAG.getBitcast(CastVT, Res);
+ if (Subtarget.hasSSE2()) {
+ MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+ SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getMemOperand()->getFlags());
+ SDValue Chain = Res.getValue(1);
+ MVT WideVT = MVT::getVectorVT(LdVT, 2);
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ Res = DAG.getBitcast(CastVT, Res);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ assert(Subtarget.hasSSE1() && "Expected SSE");
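+ // With SSE1 only, emit a 64-bit VZEXT_LOAD with a v4f32 result; the
+ // X86vzload64 patterns select it, e.g. as (MOVLPSrm (V_SET0), addr) for
+ // v4f32.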
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+ SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+ MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
- Results.push_back(Chain);
+ Results.push_back(Res.getValue(1));
return;
}
}
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
- unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+ (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+ MaskEltSizeInBits) == 0) {
+ unsigned Scale =
+ cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+ MaskEltSizeInBits;
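+ // E.g. a 64-bit VZEXT_LOAD feeding a shuffle with 32-bit mask elements
+ // gives Scale = 2: elements [0,1] must come from the load unchanged, and
+ // the remaining elements may only rearrange the known-zero upper half.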
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
- def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
}
let Predicates = [HasVLX] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
(VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
let Predicates = [HasAVX512] in {
- def : Pat<(X86vextractstore (v2i64 VR128X:$src), addr:$dst),
+ def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
(VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
}
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
-multiclass avx512_move_scalar<string asm, SDNode OpNode,
+multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _> {
let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set _.RC:$dst, (_.VT (X86vzload addr:$src)))],
+ [(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
_.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
// _alt version uses FR32/FR64 register class.
let isCodeGenOnly = 1 in
NotMemoryFoldable;
}
-defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
+defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
-defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
+defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
- def : Pat<(v16f32 (X86vzload addr:$src)),
+ def : Pat<(v16f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzload addr:$src)),
+ def : Pat<(v8f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
- def : Pat<(v2i64 (X86vzload addr:$src)),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIZrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
- def : Pat<(v16i32 (X86vzload addr:$src)),
+ def : Pat<(v16i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8i64 (X86vzload addr:$src)),
+ def : Pat<(v8i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPD patterns
- def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
}
let Predicates = [HasDQI, HasVLX] in {
- def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+ def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2QQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+ def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2UQQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+ def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2QQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
+ def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2UQQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
}
let Predicates = [HasVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSZ128rm addr:$src)>;
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
- def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
- def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
- def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
// 512-bit patterns
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
-def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86vextractstore : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
-def vzload_v4i32 : PatFrag<(ops node:$src),
- (bitconvert (v4i32 (X86vzload node:$src)))>;
-def vzload_v2i64 : PatFrag<(ops node:$src),
- (bitconvert (v2i64 (X86vzload node:$src)))>;
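+// These fragments match VZEXT_LOAD/VEXTRACT_STORE nodes by the size of the
+// memory access rather than by the result type; e.g.
+// (v4f32 (X86vzload32 addr:$src)) only matches a 4-byte zero-extending load.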
+def X86vzload32 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
+}]>;
+
+def X86vzload64 : PatFrag<(ops node:$src),
+ (X86vzld node:$src), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
+
+def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
+ (X86vextractst node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
+}]>;
def fp32imm0 : PatLeaf<(f32 fpimm), [{
// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
- PatFrag mem_pat, string OpcodeStr, Domain d> {
+ PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
+ Domain d> {
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
+ [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
Sched<[WriteFLoad]>;
// _alt version uses FR32/FR64 register class.
SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, "movss",
+ defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
SSEPackedSingle>, XS;
- defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, "movsd",
+ defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
SSEPackedDouble>, XD;
}
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8f32 (X86vzload addr:$src)),
+ def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzload addr:$src)),
+ def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
(i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
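+ // With SSE1 only, a 64-bit vzload is implemented by zeroing the register
+ // and then loading the low two floats with MOVLPS.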
+ def : Pat<(v4f32 (X86vzload64 addr:$src)),
+ (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
+ def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
+ (MOVLPSmr addr:$dst, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(VMOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
}
// No need for aligned load, we're only loading 64 bits.
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+
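+ // Store the high 64 bits (canonicalized as a movhlps with itself) with
+ // MOVHPS.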
+ def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
+ addr:$dst),
+ (MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
(MOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
- def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
// These instructions also write zeros in the high part of a 256-bit register.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzload addr:$src)),
+ def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}
(MOV64toPQIrr GR64:$src)>;
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzload addr:$src)),
+ def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
(MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
let Predicates = [UseAVX] in {
- def : Pat<(v2i64 (X86vzload addr:$src)),
+ def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v4i64 (X86vzload addr:$src)),
+ def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
- def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
(VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
- def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
+ def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
(MOVPQI2QImr addr:$dst, VR128:$src)>;
}
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
// No need for aligned memory as this only loads 64 bits.
def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(MOVDDUPrm addr:$src)>;
- def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
+ def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
- def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v4i32 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
- def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v4i32 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
- def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+ (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
let Predicates = [HasAVX2, NoVLX] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDDUPrr VR128:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
}
; X86-SSE-LABEL: test_mm_loadh_pi:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
-; X86-SSE-NEXT: # xmm2 = xmm2[0,0],xmm1[0,0]
-; X86-SSE-NEXT: shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
-; X86-SSE-NEXT: # xmm0 = xmm0[0,1],xmm2[2,0]
+; X86-SSE-NEXT: movhps (%eax), %xmm0 # encoding: [0x0f,0x16,0x00]
+; X86-SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadh_pi:
;
; X64-SSE-LABEL: test_mm_loadh_pi:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
-; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
-; X64-SSE-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
-; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
-; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
-; X64-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
-; X64-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
-; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: movhps (%rdi), %xmm0 # encoding: [0x0f,0x16,0x07]
+; X64-SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadh_pi:
; X86-SSE-LABEL: test_mm_loadl_pi:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
-; X86-SSE-NEXT: # xmm1 = xmm1[0,0],xmm2[0,0]
-; X86-SSE-NEXT: shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
-; X86-SSE-NEXT: # xmm1 = xmm1[2,0],xmm0[2,3]
-; X86-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X86-SSE-NEXT: movlps (%eax), %xmm0 # encoding: [0x0f,0x12,0x00]
+; X86-SSE-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadl_pi:
;
; X64-SSE-LABEL: test_mm_loadl_pi:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
-; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
-; X64-SSE-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
-; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
-; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
-; X64-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
-; X64-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
-; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE-NEXT: shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
-; X64-SSE-NEXT: # xmm1 = xmm1[0,1],xmm0[2,3]
-; X64-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
+; X64-SSE-NEXT: movlps (%rdi), %xmm0 # encoding: [0x0f,0x12,0x07]
+; X64-SSE-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadl_pi:
; X86-SSE-LABEL: test_mm_storeh_pi2:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movaps %xmm0, %xmm1 # encoding: [0x0f,0x28,0xc8]
-; X86-SSE-NEXT: movhlps %xmm0, %xmm1 # encoding: [0x0f,0x12,0xc8]
-; X86-SSE-NEXT: # xmm1 = xmm0[1],xmm1[1]
-; X86-SSE-NEXT: shufps $231, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe7]
-; X86-SSE-NEXT: # xmm0 = xmm0[3,1,2,3]
-; X86-SSE-NEXT: movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
-; X86-SSE-NEXT: movss %xmm1, (%eax) # encoding: [0xf3,0x0f,0x11,0x08]
+; X86-SSE-NEXT: movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_pi2:
;
; X64-SSE-LABEL: test_mm_storeh_pi2:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0]
-; X64-SSE-NEXT: # xmm0 = xmm0[1,1]
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
-; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
+; X64-SSE-NEXT: movhps %xmm0, (%rdi) # encoding: [0x0f,0x17,0x07]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeh_pi2:
; X86-SSE-LABEL: test_mm_storel_pi2:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
-; X86-SSE-NEXT: shufps $229, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe5]
-; X86-SSE-NEXT: # xmm0 = xmm0[1,1,2,3]
-; X86-SSE-NEXT: movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
+; X86-SSE-NEXT: movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storel_pi2:
;
; X64-SSE-LABEL: test_mm_storel_pi2:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
-; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
+; X64-SSE-NEXT: movlps %xmm0, (%rdi) # encoding: [0x0f,0x13,0x07]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storel_pi2:
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_lo_v4f32:
; SSE1: # %bb.0:
-; SSE1-NEXT: movq (%rdi), %rax
-; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT: shrq $32, %rax
-; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
-; SSE1-NEXT: movaps %xmm1, %xmm0
+; SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_hi_v4f32:
; SSE1: # %bb.0:
-; SSE1-NEXT: movq (%rdi), %rax
-; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT: shrq $32, %rax
-; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>