}
}
+ // Pull subvector inserts into undef through VZEXT_MOVL by making it an
+ // insert into a zero vector. This helps get VZEXT_MOVL closer to
+ // scalar_to_vectors, where 256/512-bit types are canonicalized to an insert
+ // and a 128-bit scalar_to_vector. This reduces the number of isel patterns.
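+ // That is, rewrite
+ //   (vzext_movl (insert_subvector undef, X, 0))
+ // into
+ //   (insert_subvector (zero vector), (vzext_movl X), 0).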
+ if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
+ N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
+ N->getOperand(0).hasOneUse() &&
+ N->getOperand(0).getOperand(0).isUndef() &&
+ isNullConstant(N->getOperand(0).getOperand(2))) {
+ SDValue In = N->getOperand(0).getOperand(1);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, N->getOperand(0).getOperand(2));
+ }
+
// Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
// operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
// FIXME: This can probably go away once we default to widening legalization.
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
- def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v16f32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
- def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
-
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
}
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIZrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>;
-
  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
- def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>;
-
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 256-bit types
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(VMOV64toPQIrr GR64:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>;
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
- // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
- def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>;
}
let Predicates = [UseSSE2] in {
(VMOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
- def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
; CHECK_O0-NEXT: .LBB9_3: # %cif_mixed_test_all
; CHECK_O0-NEXT: movl $-1, %eax
; CHECK_O0-NEXT: vmovd %eax, %xmm0
+; CHECK_O0-NEXT: vmovdqa %xmm0, %xmm0
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
; CHECK_O0-NEXT: # implicit-def: $rcx
; CHECK_O0-NEXT: # implicit-def: $ymm2
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovdqu (%ecx), %xmm0
-; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT: vmovdqa %ymm0, (%eax)
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2i64_4i64:
; X64: # %bb.0:
-; X64-NEXT: vmovdqu (%rdi), %xmm0
-; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT: vmovdqa %ymm0, (%rsi)
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ld = load <2 x i64>, <2 x i64>* %in, align 8
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovdqu (%ecx), %xmm0
-; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X32-NEXT: vmovdqa %ymm0, (%eax)
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2f64_4f64:
; X64: # %bb.0:
-; X64-NEXT: vmovdqu (%rdi), %xmm0
-; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-NEXT: vmovdqa %ymm0, (%rsi)
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %in, align 8
define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL: # %bb.0:
-; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0