From: Simon Pilgrim
Date: Mon, 17 Jun 2019 18:20:04 +0000 (+0000)
Subject: [X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=22af01a41402861dd1eaf9a2210a247b7c5a7f65;p=llvm

[X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)

If an XMM non-temporal store has less than natural alignment, scalarize the
vector - with SSE4A we can stay on the vector and use MOVNTSD(f64), else we
must move to GPRs and use MOVNTI(i32/i64).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363592 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c9b8e5fa2c0..42fcb5e92e9 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21110,6 +21110,42 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
 }
 
+/// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+                                    SelectionDAG &DAG) {
+  SDValue StoredVal = Store->getValue();
+  assert(StoreVT.is128BitVector() &&
+         StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+  StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+  // Splitting volatile memory ops is not allowed unless the operation was not
+  // legal to begin with. We are assuming the input op is legal (this transform
+  // is only used for targets with AVX).
+  if (Store->isVolatile())
+    return SDValue();
+
+  MVT StoreSVT = StoreVT.getScalarType();
+  unsigned NumElems = StoreVT.getVectorNumElements();
+  unsigned ScalarSize = StoreSVT.getStoreSize();
+  unsigned Alignment = Store->getAlignment();
+
+  SDLoc DL(Store);
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Offset = i * ScalarSize;
+    SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+    SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+                              DAG.getIntPtrConstant(i, DL));
+    SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+                              Store->getPointerInfo().getWithOffset(Offset),
+                              MinAlign(Alignment, Offset),
+                              Store->getMemOperand()->getFlags());
+    Stores.push_back(Ch);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -39640,6 +39676,15 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
         return SDValue();
       return splitVectorStore(St, DAG);
     }
+
+    // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+    // to use MOVNTI.
+    if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+      MVT NTVT = Subtarget.hasSSE4A()
+                     ? MVT::v2f64
+                     : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+      return scalarizeVectorStore(St, NTVT, DAG);
+    }
   }
 
   // Optimize trunc store (of multiple scalars) to shuffle and store.
diff --git a/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index 1ffdf01e6fd..b742cab5448 100644
--- a/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -298,67 +298,105 @@ define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nou
 
 ; Nothing can perform NT-store-vector on 1-byte aligned memory.
 ; Must be scalarized to use MOVNTI/MOVNTSD.
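; (Illustrative sketch, not taken from the patch: the store shape this commit
; targets - an XMM-sized non-temporal store with alignment below the natural
; 16 bytes. The value/pointer names below are hypothetical.)
;
;   store <4 x float> %val, <4 x float>* %ptr, align 1, !nontemporal !1
;
; Before this commit such a store was spilled to the stack and reloaded into
; GPRs for the MOVNTI stores; with the scalarized lowering the elements are
; extracted straight from the XMM register (MOVD/MOVQ/PEXTRQ + MOVNTI), or
; stored with MOVNTSD on SSE4A targets, as the updated checks below show.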
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind { -; X86-LABEL: merge_2_v4f32_align1_ntstore: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movups (%ecx), %xmm0 -; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 12(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 8(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 4(%eax) -; X86-NEXT: movntil %ecx, (%eax) -; X86-NEXT: movaps %xmm1, (%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 28(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 24(%eax) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 20(%eax) -; X86-NEXT: movntil %ecx, 16(%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, (%eax) +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 12(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 8(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 28(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 24(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 20(%eax) +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm0 -; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) -; X64-SSE-NEXT: movntiq %rax, (%rsi) -; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) -; X64-SSE-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE-NEXT: retq +; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore: +; X86-SSE4A: # %bb.0: +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE4A-NEXT: movups (%ecx), %xmm0 +; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X86-SSE4A-NEXT: movntsd 
%xmm1, 24(%eax) +; X86-SSE4A-NEXT: retl +; +; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, (%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE2-NEXT: retq +; +; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE4A: # %bb.0: +; X64-SSE4A-NEXT: movups (%rdi), %xmm0 +; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) +; X64-SSE4A-NEXT: retq +; +; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, (%rsi) +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %xmm0 -; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movntiq %rax, 8(%rsi) +; X64-AVX-NEXT: vmovq %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax +; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 @@ -375,67 +413,105 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no ; Nothing can perform NT-load-vector on 1-byte aligned memory. ; Just perform regular loads and scalarize NT-stores. 
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind { -; X86-LABEL: merge_2_v4f32_align1: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movups (%ecx), %xmm0 -; X86-NEXT: movups 16(%ecx), %xmm1 -; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 12(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 8(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 4(%eax) -; X86-NEXT: movntil %ecx, (%eax) -; X86-NEXT: movaps %xmm1, (%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 28(%eax) -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movntil %ecx, 24(%eax) -; X86-NEXT: movl (%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movntil %edx, 20(%eax) -; X86-NEXT: movntil %ecx, 16(%eax) -; X86-NEXT: movl %ebp, %esp -; X86-NEXT: popl %ebp -; X86-NEXT: retl +; X86-SSE2-LABEL: merge_2_v4f32_align1: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, (%eax) +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 12(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movntil %ecx, 8(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 4(%eax) +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 16(%eax) +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 28(%eax) +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: movntil %ecx, 24(%eax) +; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movntil %ecx, 20(%eax) +; X86-SSE2-NEXT: retl ; -; X64-SSE-LABEL: merge_2_v4f32_align1: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movups (%rdi), %xmm0 -; X64-SSE-NEXT: movups 16(%rdi), %xmm1 -; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 8(%rsi) -; X64-SSE-NEXT: movntiq %rax, (%rsi) -; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-SSE-NEXT: movntiq %rcx, 24(%rsi) -; X64-SSE-NEXT: movntiq %rax, 16(%rsi) -; X64-SSE-NEXT: retq +; X86-SSE4A-LABEL: merge_2_v4f32_align1: +; X86-SSE4A: # %bb.0: +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE4A-NEXT: movups (%ecx), %xmm0 +; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) +; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax) +; X86-SSE4A-NEXT: retl +; 
+; X64-SSE2-LABEL: merge_2_v4f32_align1: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, (%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; X64-SSE2-NEXT: movq %xmm0, %rax +; X64-SSE2-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE2-NEXT: retq +; +; X64-SSE4A-LABEL: merge_2_v4f32_align1: +; X64-SSE4A: # %bb.0: +; X64-SSE4A-NEXT: movups (%rdi), %xmm0 +; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) +; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) +; X64-SSE4A-NEXT: retq +; +; X64-SSE41-LABEL: merge_2_v4f32_align1: +; X64-SSE41: # %bb.0: +; X64-SSE41-NEXT: movdqu (%rdi), %xmm0 +; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, 8(%rsi) +; X64-SSE41-NEXT: movq %xmm0, %rax +; X64-SSE41-NEXT: movntiq %rax, (%rsi) +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 24(%rsi) +; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: movntiq %rax, 16(%rsi) +; X64-SSE41-NEXT: retq ; ; X64-AVX-LABEL: merge_2_v4f32_align1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovups (%rdi), %xmm0 -; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1 -; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 8(%rsi) +; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0 +; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1 +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: movntiq %rax, 8(%rsi) +; X64-AVX-NEXT: vmovq %xmm0, %rax ; X64-AVX-NEXT: movntiq %rax, (%rsi) -; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; X64-AVX-NEXT: movntiq %rcx, 24(%rsi) +; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax +; X64-AVX-NEXT: movntiq %rax, 24(%rsi) +; X64-AVX-NEXT: vmovq %xmm1, %rax ; X64-AVX-NEXT: movntiq %rax, 16(%rsi) ; X64-AVX-NEXT: retq %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0 diff --git a/test/CodeGen/X86/nontemporal-3.ll b/test/CodeGen/X86/nontemporal-3.ll index b2b57440717..f0a2f6fac1f 100644 --- a/test/CodeGen/X86/nontemporal-3.ll +++ b/test/CodeGen/X86/nontemporal-3.ll @@ -14,31 +14,22 @@ define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v2f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v2f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: 
test_zero_v2f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1 @@ -46,33 +37,39 @@ define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind { } define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v4f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1 @@ -80,33 +77,39 @@ define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind { } define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v2i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v2i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v2i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v2i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v2i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; 
AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v2i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1 @@ -114,33 +117,39 @@ define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind { } define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v4i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1 @@ -148,33 +157,39 @@ define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind { } define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i16_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i16_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i16_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i16_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax 
+; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i16_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i16_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1 @@ -182,33 +197,39 @@ define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind { } define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i8_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i8_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i8_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorl %eax, %eax +; SSE4A-NEXT: movntiq %rax, 8(%rdi) +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i8_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i8_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i8_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) ; AVX512-NEXT: retq store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1 @@ -220,292 +241,274 @@ define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind { define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v4f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) +; 
SSE-NEXT: movntiq %rax, 24(%rdi) +; SSE-NEXT: movntiq %rax, 16(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v4f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v8f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: 
test_zero_v8f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v4i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v4i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v4i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v4i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v4i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v4i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1 ret 
void } define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i16_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i16_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 
24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i16_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i16_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i16_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i16_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1 ret void } define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind { -; SSE-LABEL: test_zero_v32i8_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v32i8_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v32i8_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v32i8_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v32i8_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq 
-{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) -; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) +; AVX-NEXT: movntiq %rax, 24(%rdi) +; AVX-NEXT: movntiq %rax, 16(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v32i8_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) -; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) +; AVX512-NEXT: movntiq %rax, 24(%rdi) +; AVX512-NEXT: movntiq %rax, 16(%rdi) ; AVX512-NEXT: retq store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1 ret void @@ -666,76 +669,40 @@ define void @test_zero_v32i8_align16(<32 x i8>* %dst) nounwind { define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind { ; SSE-LABEL: test_zero_v8f64_align1: ; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: movntiq %rax, 24(%rdi) ; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) +; SSE-NEXT: movntiq %rax, 8(%rdi) ; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) +; SSE-NEXT: movntiq %rax, 56(%rdi) ; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) +; SSE-NEXT: movntiq %rax, 40(%rdi) ; SSE-NEXT: movntiq %rax, 32(%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: test_zero_v8f64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq 
%rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8f64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1 @@ -743,78 +710,68 @@ define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind { } define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { -; SSE-LABEL: test_zero_v16f32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16f32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16f32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16f32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: 
movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16f32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16f32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1 @@ -822,78 +779,68 @@ define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind { } define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { -; SSE-LABEL: test_zero_v8i64_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v8i64_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq 
%rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v8i64_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v8i64_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v8i64_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v8i64_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 24(%rdi) +; AVX512-NEXT: xorl %eax, %eax +; AVX512-NEXT: movntiq %rax, 24(%rdi) ; AVX512-NEXT: movntiq %rax, 16(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 8(%rdi) +; AVX512-NEXT: movntiq %rax, 8(%rdi) ; AVX512-NEXT: movntiq %rax, (%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 56(%rdi) +; AVX512-NEXT: movntiq %rax, 56(%rdi) ; AVX512-NEXT: movntiq %rax, 48(%rdi) -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX512-NEXT: movntiq %rcx, 40(%rdi) +; AVX512-NEXT: movntiq %rax, 40(%rdi) ; AVX512-NEXT: movntiq %rax, 32(%rdi) ; AVX512-NEXT: retq store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1 @@ -901,78 +848,68 @@ define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind { } 
define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind { -; SSE-LABEL: test_zero_v16i32_align1: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 24(%rdi) -; SSE-NEXT: movntiq %rax, 16(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 8(%rdi) -; SSE-NEXT: movntiq %rax, (%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 56(%rdi) -; SSE-NEXT: movntiq %rax, 48(%rdi) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; SSE-NEXT: movntiq %rcx, 40(%rdi) -; SSE-NEXT: movntiq %rax, 32(%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_zero_v16i32_align1: +; SSE2: # %bb.0: +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: movntiq %rax, 24(%rdi) +; SSE2-NEXT: movntiq %rax, 16(%rdi) +; SSE2-NEXT: movntiq %rax, 8(%rdi) +; SSE2-NEXT: movntiq %rax, (%rdi) +; SSE2-NEXT: movntiq %rax, 56(%rdi) +; SSE2-NEXT: movntiq %rax, 48(%rdi) +; SSE2-NEXT: movntiq %rax, 40(%rdi) +; SSE2-NEXT: movntiq %rax, 32(%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_zero_v16i32_align1: +; SSE4A: # %bb.0: +; SSE4A-NEXT: xorps %xmm0, %xmm0 +; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) +; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_zero_v16i32_align1: +; SSE41: # %bb.0: +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: movntiq %rax, 24(%rdi) +; SSE41-NEXT: movntiq %rax, 16(%rdi) +; SSE41-NEXT: movntiq %rax, 8(%rdi) +; SSE41-NEXT: movntiq %rax, (%rdi) +; SSE41-NEXT: movntiq %rax, 56(%rdi) +; SSE41-NEXT: movntiq %rax, 48(%rdi) +; SSE41-NEXT: movntiq %rax, 40(%rdi) +; SSE41-NEXT: movntiq %rax, 32(%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_zero_v16i32_align1: ; AVX: # %bb.0: -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 24(%rdi) +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: movntiq %rax, 24(%rdi) ; AVX-NEXT: movntiq %rax, 16(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 8(%rdi) +; AVX-NEXT: movntiq %rax, 8(%rdi) ; AVX-NEXT: movntiq %rax, (%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 56(%rdi) +; AVX-NEXT: movntiq %rax, 56(%rdi) ; AVX-NEXT: movntiq %rax, 48(%rdi) -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movntiq %rcx, 40(%rdi) +; AVX-NEXT: movntiq %rax, 40(%rdi) ; AVX-NEXT: movntiq %rax, 32(%rdi) ; AVX-NEXT: retq ; ; AVX512-LABEL: test_zero_v16i32_align1: ; AVX512: # %bb.0: -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; 
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
@@ -980,78 +917,68 @@ define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
 }
 define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
-; SSE-LABEL: test_zero_v32i16_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v32i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
 ; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
 ; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
 ; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
 ; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
 ; AVX-NEXT: movntiq %rax, 32(%rdi)
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_zero_v32i16_align1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
@@ -1059,78 +986,68 @@ define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
 }
 define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
-; SSE-LABEL: test_zero_v64i8_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v64i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v64i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v64i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
 ; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
 ; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
 ; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
 ; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
 ; AVX-NEXT: movntiq %rax, 32(%rdi)
 ; AVX-NEXT: retq
 ;
 ; AVX512-LABEL: test_zero_v64i8_align1:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
 ; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
 ; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
 ; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
 ; AVX512-NEXT: movntiq %rax, 32(%rdi)
 ; AVX512-NEXT: retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1