From: Simon Pilgrim
Date: Mon, 17 Jun 2019 18:20:04 +0000 (+0000)
Subject: [X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=22af01a41402861dd1eaf9a2210a247b7c5a7f65;p=llvm

[X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)

If an XMM non-temporal store has less than natural alignment, scalarize the
vector - with SSE4A we can stay on the vector and use MOVNTSD(f64), else we
must move to GPRs and use MOVNTI(i32/i64).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363592 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c9b8e5fa2c0..42fcb5e92e9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21110,6 +21110,42 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
 }
 
+/// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+                                    SelectionDAG &DAG) {
+  SDValue StoredVal = Store->getValue();
+  assert(StoreVT.is128BitVector() &&
+         StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+  StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+  // Splitting volatile memory ops is not allowed unless the operation was not
+  // legal to begin with. We are assuming the input op is legal (this transform
+  // is only used for targets with AVX).
+  if (Store->isVolatile())
+    return SDValue();
+
+  MVT StoreSVT = StoreVT.getScalarType();
+  unsigned NumElems = StoreVT.getVectorNumElements();
+  unsigned ScalarSize = StoreSVT.getStoreSize();
+  unsigned Alignment = Store->getAlignment();
+
+  SDLoc DL(Store);
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Offset = i * ScalarSize;
+    SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+    SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+                              DAG.getIntPtrConstant(i, DL));
+    SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+                              Store->getPointerInfo().getWithOffset(Offset),
+                              MinAlign(Alignment, Offset),
+                              Store->getMemOperand()->getFlags());
+    Stores.push_back(Ch);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -39640,6 +39676,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
         return SDValue();
       return splitVectorStore(St, DAG);
     }
+
+    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+    // to use MOVNTI.
+    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+      MVT NTVT = Subtarget.hasSSE4A()
+                     ? MVT::v2f64
+                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+      return scalarizeVectorStore(St, NTVT, DAG);
+    }
   }
 
   // Optimize trunc store (of multiple scalars) to shuffle and store.
diff --git a/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 1ffdf01e6fd..b742cab5448 100644
--- a/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -298,67 +298,105 @@ define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nou
 
 ; Nothing can perform NT-store-vector on 1-byte aligned memory.
 ; Must be scalarized to use MOVNTI/MOVNTSD.
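; (Illustrative sketch, not taken from the patch: the store shape this commit
; targets - an XMM-sized non-temporal store with alignment below the natural
; 16 bytes. The value/pointer names below are hypothetical.)
;
;   store <4 x float> %val, <4 x float>* %ptr, align 1, !nontemporal !1
;
; Before this commit such a store was spilled to the stack and reloaded into
; GPRs for the MOVNTI stores; with the scalarized lowering the elements are
; extracted straight from the XMM register (MOVD/MOVQ/PEXTRQ + MOVNTI), or
; stored with MOVNTSD on SSE4A targets, as the updated checks below show.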
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind { -; X86-LABEL: merge_2_v4f32_align1_ntstore: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movups (%ecx), %xmm0 -; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 12(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 8(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 4(%eax) -; X86-NEXT: movntil %ecx, (%eax) -; X86-NEXT: movaps %xmm1, (%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 28(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 24(%eax) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 20(%eax) -; X86-NEXT: movntil %ecx, 16(%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, (%eax) +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 12(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 8(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 28(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 24(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 20(%eax) +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm0 -; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) -; X64-SSE-NEXT: movntiq %rax, (%rsi) -; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) -; X64-SSE-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE-NEXT: retq +; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore: +; X86-SSE4A: # %bb.0: +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE4A-NEXT: movups (%ecx), %xmm0 +; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X86-SSE4A-NEXT: movntsd 
%xmm1, 24(%eax) +; X86-SSE4A-NEXT: retl +; +; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, (%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE2-NEXT: retq +; +; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE4A: # %bb.0: +; X64-SSE4A-NEXT: movups (%rdi), %xmm0 +; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) +; X64-SSE4A-NEXT: retq +; +; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, (%rsi) +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %xmm0 -; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movntiq %rax, 8(%rsi) +; X64-AVX-NEXT: vmovq %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax +; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 @@ -375,67 +413,105 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no ; Nothing can perform NT-load-vector on 1-byte aligned memory. ; Just perform regular loads and scalarize NT-stores. 
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { -; X86-LABEL: merge_2_v4f32_align1: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movups (%ecx), %xmm0 -; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 12(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 8(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 4(%eax) -; X86-NEXT: movntil %ecx, (%eax) -; X86-NEXT: movaps %xmm1, (%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 28(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 24(%eax) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 20(%eax) -; X86-NEXT: movntil %ecx, 16(%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-SSE2-LABEL: merge_2_v4f32_align1: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, (%eax) +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 12(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 8(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 28(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 24(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 20(%eax) +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: merge_2_v4f32_align1: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm0 -; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) -; X64-SSE-NEXT: movntiq %rax, (%rsi) -; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) -; X64-SSE-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE-NEXT: retq +; X86-SSE4A-LABEL: merge_2_v4f32_align1: +; X86-SSE4A: # %bb.0: +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE4A-NEXT: movups (%ecx), %xmm0 +; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax) +; X86-SSE4A-NEXT: retl +; 
+; X64-SSE2-LABEL: merge_2_v4f32_align1: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, (%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE2-NEXT: retq +; +; X64-SSE4A-LABEL: merge_2_v4f32_align1: +; X64-SSE4A: # %bb.0: +; X64-SSE4A-NEXT: movups (%rdi), %xmm0 +; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) +; X64-SSE4A-NEXT: retq +; +; X64-SSE41-LABEL: merge_2_v4f32_align1: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, (%rsi) +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %xmm0 -; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movntiq %rax, 8(%rsi) +; X64-AVX-NEXT: vmovq %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax +; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 diff --git a/test/CodeGen/X86/nontemporal-3.ll b/test/CodeGen/X86/nontemporal-3.ll index b2b57440717..f0a2f6fac1f 100644 --- a/test/CodeGen/X86/nontemporal-3.ll +++ b/test/CodeGen/X86/nontemporal-3.ll @@ -14,31 +14,22 @@ define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v2f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v2f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: 
test_zero_v2f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1 @@ -46,33 +37,39 @@ define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind { } define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v4f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1 @@ -80,33 +77,39 @@ define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind { } define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v2i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v2i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v2i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v2i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v2i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; 
AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v2i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1 @@ -114,33 +117,39 @@ define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind { } define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v4i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1 @@ -148,33 +157,39 @@ define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind { } define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i16_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i16_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i16_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i16_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax 
+; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i16_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i16_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1 @@ -182,33 +197,39 @@ define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind { } define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i8_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i8_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i8_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i8_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i8_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i8_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1 @@ -220,292 +241,274 @@ define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind { define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v4f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) +; 
SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v8f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: 
test_zero_v8f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v4i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1 ret 
void } define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i16_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i16_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 
24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i16_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i16_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i16_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i16_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind { -; SSE-LABEL: test_zero_v32i8_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v32i8_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v32i8_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v32i8_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v32i8_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq 
-{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i8_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1 ret void @@ -666,76 +669,40 @@ define void @test_zero_v32i8_align16(<32 x i8>* %dst) nounwind { define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v8f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) ; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) ; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) ; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq 
%rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1 @@ -743,78 +710,68 @@ define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind { } define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v16f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: 
movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 @@ -822,78 +779,68 @@ define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { } define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq 
%rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1 @@ -901,78 +848,68 @@ define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { } 
define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; 
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
@@ -980,78 +917,68 @@ define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
 }
 define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
-; SSE-LABEL: test_zero_v32i16_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v32i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
 ; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
 ; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
 ; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
 ; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
 ; AVX-NEXT: movntiq %rax, 32(%rdi)
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_zero_v32i16_align1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
@@ -1059,78 +986,68 @@ define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
 }
 define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
-; SSE-LABEL: test_zero_v64i8_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v64i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v64i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v64i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
 ; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
 ; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
 ; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
 ; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
 ; AVX-NEXT: movntiq %rax, 32(%rdi)
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_zero_v64i8_align1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1