From: Simon Pilgrim
Date: Mon, 17 Jun 2019 17:22:38 +0000 (+0000)
Subject: [X86][AVX] Split under-aligned vector nt-stores.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1d6c2fbdd1f8dfe6534b81df84755ee580bb50f7;p=llvm

[X86][AVX] Split under-aligned vector nt-stores.

If a YMM/ZMM non-temporal store has less than natural alignment, split the
vector in half - either the halves will be sufficiently aligned, or they will
continue to be split until they are XMMs, at which point the legalizer will
scalarize the store to use MOVNTI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363582 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8827ff3de3f..c9b8e5fa2c0 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39545,6 +39545,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
   EVT VT = St->getValue().getValueType();
   EVT StVT = St->getMemoryVT();
   SDLoc dl(St);
+  unsigned Alignment = St->getAlignment();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
@@ -39595,8 +39596,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                                      StoredVal->ops().slice(32, 32));
     Hi = combinevXi1ConstantToInteger(Hi, DAG);
 
-    unsigned Alignment = St->getAlignment();
-
     SDValue Ptr0 = St->getBasePtr();
     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
 
@@ -39631,6 +39630,18 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
       return splitVectorStore(St, DAG);
   }
 
+  // Split under-aligned vector non-temporal stores.
+  if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+    // vectors or the legalizer can scalarize it to use MOVNTI.
+    if (VT.is256BitVector() || VT.is512BitVector()) {
+      unsigned NumElems = VT.getVectorNumElements();
+      if (NumElems < 2)
+        return SDValue();
+      return splitVectorStore(St, DAG);
+    }
+  }
+
   // Optimize trunc store (of multiple scalars) to shuffle and store.
   // First, pack all of the elements in one place.  Next, store to memory
   // in fewer chunks.
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index aa3e7cda18c..dc4780991a5 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -1230,9 +1230,7 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
 }
 
 ; 256-bit NT stores require 256-bit alignment.
-; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
-; could even scalarize to movnti when we have 1-alignment: nontemporal is
-; probably always worth even some 20 instruction scalarization.
+; For AVX, we lower 128-bit alignment as 2x movntps %xmm.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) { ; SSE-LABEL: test_unaligned_v8f32: ; SSE: # %bb.0: @@ -1245,14 +1243,18 @@ define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* % ; AVX-LABEL: test_unaligned_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; VLX-LABEL: test_unaligned_v8f32: ; VLX: # %bb.0: ; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; VLX-NEXT: vmovups %ymm0, (%rdi) +; VLX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; VLX-NEXT: vmovntps %xmm1, 16(%rdi) +; VLX-NEXT: vmovntps %xmm0, (%rdi) ; VLX-NEXT: vzeroupper ; VLX-NEXT: retq %r = fadd <8 x float> %a, %b diff --git a/test/CodeGen/X86/nontemporal-3.ll b/test/CodeGen/X86/nontemporal-3.ll index 6f0d31d42d2..b2b57440717 100644 --- a/test/CodeGen/X86/nontemporal-3.ll +++ b/test/CodeGen/X86/nontemporal-3.ll @@ -236,15 +236,31 @@ define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind { ; AVX-LABEL: test_zero_v4f64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1 ret void @@ -269,15 +285,31 @@ define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind { ; AVX-LABEL: test_zero_v8f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; 
AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1 ret void @@ -302,15 +334,31 @@ define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind { ; AVX-LABEL: test_zero_v4i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i64_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1 ret void @@ -335,15 +383,31 @@ define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind { ; AVX-LABEL: test_zero_v8i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i32_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1 ret void @@ -368,15 +432,31 @@ define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind { ; AVX-LABEL: test_zero_v16i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; 
AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i16_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1 ret void @@ -401,15 +481,31 @@ define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind { ; AVX-LABEL: test_zero_v32i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i8_align1: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1 ret void @@ -426,15 +522,15 @@ define void @test_zero_v4f64_align16(<4 x double>* %dst) nounwind { ; AVX-LABEL: test_zero_v4f64_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f64_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1 ret void @@ -451,15 +547,15 @@ define void @test_zero_v8f32_align16(<8 x float>* %dst) nounwind { ; AVX-LABEL: test_zero_v8f32_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f32_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1 ret void @@ -476,15 +572,15 @@ define void 
@test_zero_v4i64_align16(<4 x i64>* %dst) nounwind { ; AVX-LABEL: test_zero_v4i64_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i64_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -501,15 +597,15 @@ define void @test_zero_v8i32_align16(<8 x i32>* %dst) nounwind { ; AVX-LABEL: test_zero_v8i32_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i32_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1 ret void @@ -526,15 +622,15 @@ define void @test_zero_v16i16_align16(<16 x i16>* %dst) nounwind { ; AVX-LABEL: test_zero_v16i16_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i16_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1 ret void @@ -551,15 +647,15 @@ define void @test_zero_v32i8_align16(<32 x i8>* %dst) nounwind { ; AVX-LABEL: test_zero_v32i8_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i8_align16: ; AVX512: # %bb.0: ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, (%rdi) -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) ; AVX512-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1 ret void @@ -574,60 +670,73 @@ define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq 
%rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movq (%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movntiq %rcx, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1 ret void @@ -640,60 +749,73 @@ define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq 
%rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movq (%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movntiq %rcx, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 
40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 ret void @@ -706,60 +828,73 @@ define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movq (%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movntiq %rcx, 8(%rdi) ; AVX512-NEXT: 
movntiq %rax, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1 ret void @@ -772,60 +907,73 @@ define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 56(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 40(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 32(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movntiq %rax, 24(%rdi) -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), 
%rax +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: movq (%rsp), %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movntiq %rcx, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1 ret void @@ -838,69 +986,74 @@ define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v32i16_align1: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovups %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; 
AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v32i16_align1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 56(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 48(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 40(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 32(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 24(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 16(%rdi) -; AVX512BW-NEXT: movq (%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512BW-NEXT: movntiq %rcx, 8(%rdi) -; AVX512BW-NEXT: movntiq %rax, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v32i16_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1 ret void } @@ -912,69 +1065,74 @@ define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind { ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) +; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) +; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) +; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 48(%rdi) ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) +; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align1: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; 
AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 48(%rdi) +; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v64i8_align1: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovups %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v64i8_align1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 56(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 48(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 40(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 32(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 24(%rdi) -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: movntiq %rax, 16(%rdi) -; AVX512BW-NEXT: movq (%rsp), %rax -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512BW-NEXT: movntiq %rcx, 8(%rdi) -; AVX512BW-NEXT: movntiq %rax, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v64i8_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 48(%rdi) +; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 32(%rdi) +; AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1 ret void } @@ -983,39 +1141,28 @@ define void @test_zero_v8f64_align16(<8 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v8f64_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: 
movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %xmm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1 ret void @@ -1025,39 +1172,28 @@ define void @test_zero_v16f32_align16(<16 x float>* %dst) nounwind { ; SSE-LABEL: test_zero_v16f32_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %xmm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1 ret void @@ -1067,39 +1203,28 @@ define void @test_zero_v8i64_align16(<8 x i64>* %dst) nounwind { ; SSE-LABEL: test_zero_v8i64_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, 
%xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %xmm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1 ret void @@ -1109,39 +1234,28 @@ define void @test_zero_v16i32_align16(<16 x i32>* %dst) nounwind { ; SSE-LABEL: test_zero_v16i32_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %xmm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512-NEXT: vmovntps %xmm1, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) ; AVX512-NEXT: vmovntps %xmm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1 ret void @@ -1151,48 +1265,29 @@ define void @test_zero_v32i16_align16(<32 x i16>* %dst) nounwind { ; SSE-LABEL: test_zero_v32i16_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v32i16_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: 
vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v32i16_align16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovups %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v32i16_align16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %xmm0 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi) -; AVX512BW-NEXT: vmovntps %xmm0, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v32i16_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1 ret void } @@ -1201,48 +1296,29 @@ define void @test_zero_v64i8_align16(<64 x i8>* %dst) nounwind { ; SSE-LABEL: test_zero_v64i8_align16: ; SSE: # %bb.0: ; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movntps %xmm0, 48(%rdi) -; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: movntps %xmm0, 16(%rdi) ; SSE-NEXT: movntps %xmm0, (%rdi) +; SSE-NEXT: movntps %xmm0, 48(%rdi) +; SSE-NEXT: movntps %xmm0, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v64i8_align16: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovups %ymm0, 32(%rdi) -; AVX-NEXT: vmovups %ymm0, (%rdi) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX-NEXT: vmovntps %xmm0, (%rdi) +; AVX-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX-NEXT: vmovntps %xmm0, 32(%rdi) ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v64i8_align16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovups %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v64i8_align16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %xmm0 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3 -; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi) -; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi) -; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi) -; AVX512BW-NEXT: vmovntps %xmm0, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v64i8_align16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntps %xmm0, 16(%rdi) +; AVX512-NEXT: vmovntps %xmm0, (%rdi) +; AVX512-NEXT: vmovntps %xmm0, 48(%rdi) +; AVX512-NEXT: vmovntps %xmm0, 32(%rdi) +; 
AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1 ret void } @@ -1267,18 +1343,9 @@ define void @test_zero_v8f64_align32(<8 x double>* %dst) nounwind { ; ; AVX512-LABEL: test_zero_v8f64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %ymm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) ; AVX512-NEXT: vmovntps %ymm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1 @@ -1305,18 +1372,9 @@ define void @test_zero_v16f32_align32(<16 x float>* %dst) nounwind { ; ; AVX512-LABEL: test_zero_v16f32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %ymm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) ; AVX512-NEXT: vmovntps %ymm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1 @@ -1343,18 +1401,9 @@ define void @test_zero_v8i64_align32(<8 x i64>* %dst) nounwind { ; ; AVX512-LABEL: test_zero_v8i64_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %ymm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) ; AVX512-NEXT: vmovntps %ymm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1 @@ -1381,18 +1430,9 @@ define void @test_zero_v16i32_align32(<16 x i32>* %dst) nounwind { ; ; AVX512-LABEL: test_zero_v16i32_align32: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: movq %rsp, %rbp -; AVX512-NEXT: andq $-64, %rsp -; AVX512-NEXT: subq $128, %rsp ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %zmm0, (%rsp) -; AVX512-NEXT: vmovaps (%rsp), %ymm0 -; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512-NEXT: vmovntps %ymm1, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) ; AVX512-NEXT: vmovntps %ymm0, (%rdi) -; AVX512-NEXT: movq %rbp, %rsp -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1 @@ -1417,30 +1457,13 @@ define void @test_zero_v32i16_align32(<32 x i16>* %dst) nounwind { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v32i16_align32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v32i16_align32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: 
pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %ymm0 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi) -; AVX512BW-NEXT: vmovntps %ymm0, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v32i16_align32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1 ret void } @@ -1463,30 +1486,13 @@ define void @test_zero_v64i8_align32(<64 x i8>* %dst) nounwind { ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512DQ-LABEL: test_zero_v64i8_align32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi) -; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi) -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_zero_v64i8_align32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: pushq %rbp -; AVX512BW-NEXT: movq %rsp, %rbp -; AVX512BW-NEXT: andq $-64, %rsp -; AVX512BW-NEXT: subq $128, %rsp -; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovaps %zmm0, (%rsp) -; AVX512BW-NEXT: vmovaps (%rsp), %ymm0 -; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 -; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi) -; AVX512BW-NEXT: vmovntps %ymm0, (%rdi) -; AVX512BW-NEXT: movq %rbp, %rsp -; AVX512BW-NEXT: popq %rbp -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_zero_v64i8_align32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovntps %ymm0, 32(%rdi) +; AVX512-NEXT: vmovntps %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1 ret void }
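
A minimal IR reproducer, distilled from the tests above, is sketched below as a sanity check of the new combine; the function name and the llc invocation are illustrative assumptions, not part of the patch. With only 16-byte alignment, an AVX target should now split the <8 x float> non-temporal store into two XMM vmovntps stores rather than falling back to a plain unaligned vmovups %ymm.

; Sketch only - assumes an invocation like: llc -mtriple=x86_64-unknown-unknown -mattr=+avx
define void @nt_store_v8f32_align16(<8 x float>* %dst, <8 x float> %v) nounwind {
  ; Under-aligned (16-byte) YMM nt-store: the combine splits it into two aligned
  ; XMM halves (vmovntps %xmm, 16(%rdi) / vmovntps %xmm, (%rdi)); with align 1 the
  ; halves are split further and the legalizer scalarizes them to movnti instead.
  store <8 x float> %v, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}

!1 = !{i32 1}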