(v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
  def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
              (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
              (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
}
// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
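+  // For 256-bit X86vzmovl of f64/i64 vectors, emit a 128-bit VMOVQ
+  // (VMOVZPQILo2PQIZrr) on the low subvector; SUBREG_TO_REG re-widens the
+  // result, with the upper bits of the YMM register implicitly zeroed.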
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+ sub_xmm)>;
+
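+  // Same approach for 512-bit vectors: the 128-bit VMOVQ zeroes bits 64-127,
+  // and the write to the XMM register implicitly zeroes the rest of the ZMM.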
+ def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIZrr
+ (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIZrr
+ (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+ sub_xmm)>;
}
//===----------------------------------------------------------------------===//
(SUBREG_TO_REG (i32 0),
(v4i32 (VMOVSSrr (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
- sub_xmm)>;
}
let Predicates = [UseSSE1] in {
(MOVZPQILo2PQIrr VR128:$src)>;
}
+let Predicates = [UseAVX] in {
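+  // Without AVX-512, lower 256-bit f64/i64 X86vzmovl to a 128-bit VMOVQ on the
+  // low subvector rather than zeroing a register and using VMOVSD/VBLENDPD.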
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2f64 (VMOVZPQILo2PQIrr
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (v2i64 (VMOVZPQILo2PQIrr
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+ sub_xmm)>;
+}
+
//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
(v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
(i8 3))), sub_xmm)>;
-
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
- (i8 1))), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
- (SUBREG_TO_REG (i32 0),
- (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
- (i8 0xf))), sub_xmm)>;
}
// Prefer a movss or movsd over a blendps when optimizing for size. these were
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovups (%ecx), %xmm0
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vmovdqu (%ecx), %xmm0
+; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: vmovdqa %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2i64_4i64:
; X64: # %bb.0:
-; X64-NEXT: vmovups (%rdi), %xmm0
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vmovdqu (%rdi), %xmm0
+; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: vmovdqa %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ld = load <2 x i64>, <2 x i64>* %in, align 8
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovups (%ecx), %xmm0
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X32-NEXT: vmovaps %ymm0, (%eax)
+; X32-NEXT: vmovdqu (%ecx), %xmm0
+; X32-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X32-NEXT: vmovdqa %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2f64_4f64:
; X64: # %bb.0:
-; X64-NEXT: vmovups (%rdi), %xmm0
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; X64-NEXT: vmovaps %ymm0, (%rsi)
+; X64-NEXT: vmovdqu (%rdi), %xmm0
+; X64-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT: vmovdqa %ymm0, (%rsi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %in, align 8
; X32-AVX-NEXT: andl $-128, %esp
; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
; X32-AVX-NEXT: movl 40(%ebp), %ecx
-; X32-AVX-NEXT: vbroadcastsd 32(%ebp), %ymm0
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X32-AVX-NEXT: vpbroadcastq 32(%ebp), %ymm0
+; X32-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovaps %ymm1, (%esp)
-; X32-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: leal (%ecx,%ecx), %eax
; X32-AVX-NEXT: andl $31, %eax
; X32-AVX-NEXT: movl 128(%esp,%eax,4), %eax
; X64-AVX-NEXT: andq $-128, %rsp
; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
; X64-AVX-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
-; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
+; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps %ymm1, (%rsp)
-; X64-AVX-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: andl $15, %edi
; X64-AVX-NEXT: movq (%rsp,%rdi,8), %rax
; X64-AVX-NEXT: movq %rbp, %rsp
; ALL-LABEL: insert_reg_and_zero_v4f64:
; ALL: # %bb.0:
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: retq
%v = insertelement <4 x double> undef, double %a, i32 0
%shuffle = shufflevector <4 x double> %v, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
define <4 x double> @shuffle_v4f64_0zzz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_0zzz_optsize:
; ALL: # %bb.0:
-; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: retq
%b = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x double> %b
define <4 x i64> @shuffle_v4i64_0zzz_optsize(<4 x i64> %a) optsize {
; ALL-LABEL: shuffle_v4i64_0zzz_optsize:
; ALL: # %bb.0:
-; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: retq
%b = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i64> %b
define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
; ALL-LABEL: shuffle_v8i64_0zzzzzzz:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <8 x i64> %shuffle
define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
; ALL-LABEL: shuffle_v8f64_0zzzzzzz:
; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; ALL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <8 x double> %shuffle
define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; CHECK-LABEL: combine_pshufb_as_vzmovl_64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: ret{{[l|q]}}
%1 = bitcast <4 x double> %a0 to <32 x i8>
%2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)