; AVX-LABEL: test_zero_v4f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
ret void
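; Under-aligned (align 1) nontemporal vector stores cannot use MOVNTPS or
; VMOVNTPS, which fault on misaligned addresses, so they are scalarized: the
; zero vector is spilled to the stack with an ordinary aligned store, and each
; 8-byte lane is reloaded into a GPR and written with MOVNTI, which carries no
; alignment requirement. Roughly, in C intrinsics (a sketch only; "dst" is a
; hypothetical pointer, <immintrin.h> assumed):
;   for (int i = 0; i != 4; ++i)
;     _mm_stream_si64((long long *)dst + i, 0); /* unaligned-safe NT store */
; The remaining align-1 cases below exercise the same lowering for every
; 256-bit element type.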
; AVX-LABEL: test_zero_v8f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v4i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v8i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v16i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i16_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v32i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i8_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
ret void
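; With 16-byte alignment, a 256-bit nontemporal store can instead be split
; into two naturally aligned 128-bit VMOVNTPS stores: no stack round-trip,
; and no vzeroupper, since only xmm0 is written. In C intrinsics this is
; roughly (a sketch; "dst" is hypothetical):
;   _mm_stream_ps((float *)dst + 0, _mm_setzero_ps());
;   _mm_stream_ps((float *)dst + 4, _mm_setzero_ps());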
; AVX-LABEL: test_zero_v4f64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v8f32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v4i64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v8i32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v16i16_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i16_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1
ret void
; AVX-LABEL: test_zero_v32i8_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i8_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1
ret void
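; The 512-bit align-1 cases below scalarize the same way, eight MOVNTIs per
; store. Note that the expected store order also changes: offsets 16/24 are
; now written first, then 0/8, then 48/56, then 32/40, reflecting the order
; in which the split halves are emitted.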
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
ret void
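; Note the AVX512 side above: the old lowering spilled a whole zmm, which
; forced a realigned frame (pushq %rbp / andq $-64, %rsp) plus a trailing
; vzeroupper; the new lowering reuses four 16-byte red-zone spills of xmm0,
; so the frame setup and the vzeroupper disappear entirely.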
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
ret void
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
ret void
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
ret void
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v32i16_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT: movq (%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT: movntiq %rax, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v32i16_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
ret void
}
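; For v32i16 and v64i8, the previously separate AVX512DQ and AVX512BW check
; prefixes collapse into a single AVX512 prefix: both subtargets now emit the
; identical scalarized sequence, so distinct prefixes no longer add coverage.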
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v64i8_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT: movq (%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT: movntiq %rax, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v64i8_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
ret void
}
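; With 16-byte alignment, the 512-bit store splits into four aligned VMOVNTPS
; xmm stores. The old AVX512 code spilled a zmm to a realigned stack slot and
; reloaded four xmm registers just to stream them back out; the new code
; streams xmm0 directly four times.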
; SSE-LABEL: test_zero_v8f64_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
ret void
; SSE-LABEL: test_zero_v16f32_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
ret void
; SSE-LABEL: test_zero_v8i64_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
ret void
; SSE-LABEL: test_zero_v16i32_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
ret void
; SSE-LABEL: test_zero_v32i16_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v32i16_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v32i16_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
ret void
}
; SSE-LABEL: test_zero_v64i8_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v64i8_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v64i8_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
ret void
}
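; With 32-byte alignment, the 512-bit store splits into two aligned VMOVNTPS
; ymm stores; vzeroupper is still required because ymm registers are written.
; In C intrinsics this is roughly (a sketch; "dst" is hypothetical):
;   _mm256_stream_ps((float *)dst + 0, _mm256_setzero_ps());
;   _mm256_stream_ps((float *)dst + 8, _mm256_setzero_ps());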
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v32i16_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v32i16_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
ret void
}
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v64i8_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v64i8_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
ret void
}
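; All stores above rely on the !nontemporal marker; the metadata node is
; presumably defined elsewhere in the file as !1 = !{i32 1}, the only form
; the nontemporal metadata may take.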