Movl, N->getOperand(0).getOperand(2));
}
+  // If this is a vzmovl of a full vector load, replace it with a vzload,
+  // unless the load is volatile.
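+  // For example, (X86ISD::VZEXT_MOVL (load <4 x i32>)) becomes a single
+  // X86ISD::VZEXT_LOAD that reads only the low i32 and zeroes the upper
+  // lanes; the wide load's data result had the vzmovl as its sole user, so
+  // the old load goes dead once its chain users are rewired below.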
+  if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
+      ISD::isNormalLoad(N->getOperand(0).getNode())) {
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    if (!LN->isVolatile()) {
+      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                  VT.getVectorElementType(),
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  MachineMemOperand::MOLoad);
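+      // Rewire the original load's chain users to the vzload's chain result;
+      // with its data result unused after this combine, the old load is dead.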
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return VZLoad;
+    }
+  }
+
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(VMOVSSZrm addr:$src)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (VMOVSSZrm addr:$src)>;
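+  // Note: a vzmovl of a full-width, non-volatile vector load is now turned
+  // into X86vzload by a DAG combine, so it is matched by the X86vzload
+  // patterns instead.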
// MOVSDrm zeros the high parts of the register; represent this
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(VMOVSDZrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (VMOVSDZrm addr:$src)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(VMOVSSrm addr:$src)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (VMOVSSrm addr:$src)>;
def : Pat<(v4f32 (X86vzload addr:$src)),
(VMOVSSrm addr:$src)>;
// with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(VMOVSDrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (VMOVSDrm addr:$src)>;
def : Pat<(v2f64 (X86vzload addr:$src)),
(VMOVSDrm addr:$src)>;
// MOVSSrm already zeros the high parts of the register.
def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
(MOVSSrm addr:$src)>;
- def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (MOVSSrm addr:$src)>;
}
let Predicates = [UseSSE2] in {
// MOVSDrm already zeros the high parts of the register.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
(MOVSDrm addr:$src)>;
- def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (MOVSDrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
- def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
- (MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
(MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
let Predicates = [UseAVX] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
- (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
ret <2 x i64>%Y
}
-; FIXME: We shouldn't shrink the load to movss here since it is volatile.
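+; The volatile load must not be narrowed; only the zeroing of the upper
+; elements is done in registers.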
define <4 x i32> @load_zmov_4i32_to_0zzz_volatile(<4 x i32> *%ptr) {
-; SSE-LABEL: load_zmov_4i32_to_0zzz_volatile:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: retq
+; SSE2-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm1
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps (%rdi), %xmm1
+; SSSE3-NEXT: xorps %xmm0, %xmm0
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_zmov_4i32_to_0zzz_volatile:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movaps (%rdi), %xmm1
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: load_zmov_4i32_to_0zzz_volatile:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
entry:
%X = load volatile <4 x i32>, <4 x i32>* %ptr
ret <4 x i32>%Y
}
-; FIXME: We shouldn't shrink the load to movsd here since it is volatile.
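+; The volatile load is kept as a full 128-bit load; movq then zeroes the
+; upper half in a register.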
define <2 x i64> @load_zmov_2i64_to_0z_volatile(<2 x i64> *%ptr) {
; SSE-LABEL: load_zmov_2i64_to_0z_volatile:
; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: load_zmov_2i64_to_0z_volatile:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
entry:
%X = load volatile <2 x i64>, <2 x i64>* %ptr