From 0a2d836fd998ef546fb78e24ebb04d210cb8a948 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Thu, 15 Aug 2019 18:23:37 +0000
Subject: [PATCH] [X86] Add custom type legalization for bitcasting mmx to
 v2i32/v4i16/v8i8 to use movq2dq instead of going through memory.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369031 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         |  10 ++
 lib/Target/X86/X86ISelLowering.h           |   4 +
 lib/Target/X86/X86InstrMMX.td              |   7 ++
 test/CodeGen/X86/3dnow-intrinsics.ll       |  24 ++--
 test/CodeGen/X86/mmx-arg-passing-x86-64.ll |   9 +-
 test/CodeGen/X86/mmx-arith.ll              | 126 ++++++++-------------
 test/CodeGen/X86/mmx-cvt.ll                |  16 +--
 test/CodeGen/X86/vec_extract-mmx.ll        |  16 +--
 test/CodeGen/X86/vec_insert-7.ll           |   5 +-
 9 files changed, 95 insertions(+), 122 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7e36a05e8b1..fa6cc53df3d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27823,6 +27823,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       return;
     }
 
+    if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+      assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
+             "Unexpected type action!");
+      EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
+      SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+      Results.push_back(Res);
+      return;
+    }
+
     return;
   }
   case ISD::MGATHER: {
@@ -27934,6 +27943,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
+  case X86ISD::MOVQ2DQ:            return "X86ISD::MOVQ2DQ";
   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 09b0f6bc42b..3b39c85a6a8 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -144,6 +144,10 @@ namespace llvm {
       /// relative displacements.
       WrapperRIP,
 
+      /// Copies a 64-bit value from an MMX vector to the low word
+      /// of an XMM vector, with the high word zero filled.
+      MOVQ2DQ,
+
       /// Copies a 64-bit value from the low word of an XMM vector
       /// to an MMX vector.
       MOVDQ2Q,
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 54d4757dbd2..5125edf7834 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -567,6 +567,13 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                                 (int_x86_mmx_pmovmskb VR64:$src))]>,
                           Sched<[WriteMMXMOVMSK]>;
 
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
 // Low word of XMM to MMX.
 def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
                             [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
diff --git a/test/CodeGen/X86/3dnow-intrinsics.ll b/test/CodeGen/X86/3dnow-intrinsics.ll
index cb23655c039..611ba11c510 100644
--- a/test/CodeGen/X86/3dnow-intrinsics.ll
+++ b/test/CodeGen/X86/3dnow-intrinsics.ll
@@ -13,8 +13,7 @@ define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind rea
 ; X64-LABEL: test_pavgusb:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pavgusb %mm1, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast x86_mmx %a.coerce to <8 x i8>
@@ -50,8 +49,7 @@ define <2 x i32> @test_pf2id(<2 x float> %a) nounwind readnone {
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movdq2q %xmm0, %mm0
 ; X64-NEXT:    pf2id %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x float> %a to x86_mmx
@@ -166,8 +164,7 @@ define <2 x i32> @test_pfcmpeq(<2 x float> %a, <2 x float> %b) nounwind readnone
 ; X64-NEXT:    movdq2q %xmm1, %mm0
 ; X64-NEXT:    movdq2q %xmm0, %mm1
 ; X64-NEXT:    pfcmpeq %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm1, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x float> %a to x86_mmx
@@ -205,8 +202,7 @@ define <2 x i32> @test_pfcmpge(<2 x float> %a, <2 x float> %b) nounwind readnone
 ; X64-NEXT:    movdq2q %xmm1, %mm0
 ; X64-NEXT:    movdq2q %xmm0, %mm1
 ; X64-NEXT:    pfcmpge %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm1, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x float> %a to x86_mmx
@@ -244,8 +240,7 @@ define <2 x i32> @test_pfcmpgt(<2 x float> %a, <2 x float> %b) nounwind readnone
 ; X64-NEXT:    movdq2q %xmm1, %mm0
 ; X64-NEXT:    movdq2q %xmm0, %mm1
 ; X64-NEXT:    pfcmpgt %mm0, %mm1
-; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm1, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x float> %a to x86_mmx
@@ -679,8 +674,7 @@ define <4 x i16> @test_pmulhrw(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind re
 ; X64-LABEL: test_pmulhrw:
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pmulhrw %mm1, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast x86_mmx %a.coerce to <4 x i16>
@@ -716,8 +710,7 @@ define <2 x i32> @test_pf2iw(<2 x float> %a) nounwind readnone {
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movdq2q %xmm0, %mm0
 ; X64-NEXT:    pf2iw %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x float> %a to x86_mmx
@@ -891,8 +884,7 @@ define <2 x i32> @test_pswapdsi(<2 x i32> %a) nounwind readnone {
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    movdq2q %xmm0, %mm0
 ; X64-NEXT:    pswapd %mm0, %mm0 # mm0 = mm0[1,0]
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    retq
 entry:
   %0 = bitcast <2 x i32> %a to x86_mmx
diff --git a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index 1f654109ab8..85fa9bb2a8f 100644
--- a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -22,12 +22,11 @@ define void @t3() nounwind  {
 define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind  {
 ; X86-64-LABEL: t4:
 ; X86-64:       ## %bb.0:
-; X86-64-NEXT:    movdq2q %xmm0, %mm0
-; X86-64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X86-64-NEXT:    movdq2q %xmm1, %mm0
-; X86-64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X86-64-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0
-; X86-64-NEXT:    paddb -{{[0-9]+}}(%rsp), %xmm0
+; X86-64-NEXT:    movdq2q %xmm0, %mm1
+; X86-64-NEXT:    movq2dq %mm1, %xmm1
+; X86-64-NEXT:    movq2dq %mm0, %xmm0
+; X86-64-NEXT:    paddb %xmm1, %xmm0
 ; X86-64-NEXT:    movb $1, %al
 ; X86-64-NEXT:    jmp _pass_v8qi ## TAILCALL
   %v1a = bitcast x86_mmx %v1 to <8 x i8>
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 0654606108d..e9c86af6503 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -8,15 +8,8 @@
 define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X32-LABEL: test0:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $48, %esp
-; X32-NEXT:    movl 12(%ebp), %ecx
-; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    paddb %xmm0, %xmm1
@@ -25,9 +18,8 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    paddsb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    paddusb (%ecx), %mm0
-; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movdqa {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT:    movq %mm0, (%eax)
+; X32-NEXT:    movq2dq %mm0, %xmm0
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    psubb %xmm1, %xmm0
 ; X32-NEXT:    movdq2q %xmm0, %mm0
@@ -35,29 +27,25 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    psubsb (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    psubusb (%ecx), %mm0
-; X32-NEXT:    movq %mm0, (%esp)
-; X32-NEXT:    movdqa (%esp), %xmm0
 ; X32-NEXT:    movq %mm0, (%eax)
+; X32-NEXT:    movq2dq %mm0, %xmm0
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X32-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-NEXT:    pmullw %xmm1, %xmm0
-; X32-NEXT:    pand {{\.LCPI.*}}, %xmm0
-; X32-NEXT:    packuswb %xmm0, %xmm0
-; X32-NEXT:    movq %xmm0, (%eax)
-; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    pand %xmm0, %xmm1
+; X32-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X32-NEXT:    pmullw %xmm0, %xmm1
+; X32-NEXT:    pand {{\.LCPI.*}}, %xmm1
+; X32-NEXT:    packuswb %xmm1, %xmm1
 ; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    por %xmm1, %xmm0
+; X32-NEXT:    pand %xmm1, %xmm0
 ; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    pxor %xmm0, %xmm1
+; X32-NEXT:    por %xmm0, %xmm1
 ; X32-NEXT:    movq %xmm1, (%eax)
+; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    pxor %xmm1, %xmm0
+; X32-NEXT:    movq %xmm0, (%eax)
 ; X32-NEXT:    emms
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test0:
@@ -70,9 +58,8 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    paddsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0
 ; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubb %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -80,25 +67,24 @@ define void @test0(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    psubsb (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusb (%rsi), %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0
 ; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-NEXT:    pmullw %xmm1, %xmm0
-; X64-NEXT:    pand {{.*}}(%rip), %xmm0
-; X64-NEXT:    packuswb %xmm0, %xmm0
-; X64-NEXT:    movq %xmm0, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    pand %xmm0, %xmm1
+; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT:    pmullw %xmm0, %xmm1
+; X64-NEXT:    pand {{.*}}(%rip), %xmm1
+; X64-NEXT:    packuswb %xmm1, %xmm1
 ; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    pand %xmm1, %xmm0
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    pxor %xmm0, %xmm1
+; X64-NEXT:    por %xmm0, %xmm1
 ; X64-NEXT:    movq %xmm1, (%rdi)
+; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    pxor %xmm1, %xmm0
+; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
@@ -253,15 +239,8 @@ entry:
 define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X32-LABEL: test2:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $64, %esp
-; X32-NEXT:    movl 12(%ebp), %ecx
-; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    paddw %xmm0, %xmm1
@@ -270,9 +249,8 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    paddsw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    paddusw (%ecx), %mm0
-; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movdqa {{[0-9]+}}(%esp), %xmm0
 ; X32-NEXT:    movq %mm0, (%eax)
+; X32-NEXT:    movq2dq %mm0, %xmm0
 ; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X32-NEXT:    psubw %xmm1, %xmm0
 ; X32-NEXT:    movdq2q %xmm0, %mm0
@@ -280,30 +258,27 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X32-NEXT:    psubsw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    psubusw (%ecx), %mm0
-; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movq %mm0, (%eax)
-; X32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    pmullw {{[0-9]+}}(%esp), %xmm0
-; X32-NEXT:    movdq2q %xmm0, %mm0
-; X32-NEXT:    movq %xmm0, (%eax)
+; X32-NEXT:    movq2dq %mm0, %xmm0
+; X32-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    pmullw %xmm0, %xmm1
+; X32-NEXT:    movdq2q %xmm1, %mm0
+; X32-NEXT:    movq %xmm1, (%eax)
 ; X32-NEXT:    pmulhw (%ecx), %mm0
 ; X32-NEXT:    movq %mm0, (%eax)
 ; X32-NEXT:    pmaddwd (%ecx), %mm0
-; X32-NEXT:    movq %mm0, (%esp)
 ; X32-NEXT:    movq %mm0, (%eax)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    andps (%esp), %xmm0
-; X32-NEXT:    movlps %xmm0, (%eax)
+; X32-NEXT:    movq2dq %mm0, %xmm0
 ; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT:    orps %xmm0, %xmm1
+; X32-NEXT:    andps %xmm0, %xmm1
 ; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    xorps %xmm1, %xmm0
+; X32-NEXT:    orps %xmm1, %xmm0
 ; X32-NEXT:    movlps %xmm0, (%eax)
+; X32-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT:    xorps %xmm0, %xmm1
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    emms
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test2:
@@ -316,9 +291,8 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    paddsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    paddusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0
 ; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    psubw %xmm1, %xmm0
 ; X64-NEXT:    movdq2q %xmm0, %mm0
@@ -326,26 +300,26 @@ define void @test2(x86_mmx* %A, x86_mmx* %B) {
 ; X64-NEXT:    psubsw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    psubusw (%rsi), %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq %mm0, (%rdi)
-; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pmullw -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    movdq2q %xmm0, %mm0
-; X64-NEXT:    movq %xmm0, (%rdi)
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    pmullw %xmm0, %xmm1
+; X64-NEXT:    movdq2q %xmm1, %mm0
+; X64-NEXT:    movq %xmm1, (%rdi)
 ; X64-NEXT:    pmulhw (%rsi), %mm0
 ; X64-NEXT:    movq %mm0, (%rdi)
 ; X64-NEXT:    pmaddwd (%rsi), %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq %mm0, (%rdi)
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    andps -{{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    movlps %xmm0, (%rdi)
+; X64-NEXT:    movq2dq %mm0, %xmm0
 ; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X64-NEXT:    orps %xmm0, %xmm1
+; X64-NEXT:    andps %xmm0, %xmm1
 ; X64-NEXT:    movlps %xmm1, (%rdi)
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    xorps %xmm1, %xmm0
+; X64-NEXT:    orps %xmm1, %xmm0
 ; X64-NEXT:    movlps %xmm0, (%rdi)
+; X64-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT:    xorps %xmm0, %xmm1
+; X64-NEXT:    movlps %xmm1, (%rdi)
 ; X64-NEXT:    emms
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll
index 5f6a8885b61..339df30892a 100644
--- a/test/CodeGen/X86/mmx-cvt.ll
+++ b/test/CodeGen/X86/mmx-cvt.ll
@@ -294,26 +294,20 @@ define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
 define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
 ; X86-LABEL: sitofp_v2i32_v2f32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebp
-; X86-NEXT:    movl %esp, %ebp
-; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
-; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movq (%eax), %mm0
 ; X86-NEXT:    paddd %mm0, %mm0
-; X86-NEXT:    movq %mm0, (%esp)
-; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    movq2dq %mm0, %xmm0
+; X86-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-NEXT:    movl %ebp, %esp
-; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: sitofp_v2i32_v2f32:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq (%rdi), %mm0
 ; X64-NEXT:    paddd %mm0, %mm0
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
 ; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-NEXT:    retq
   %2 = bitcast <1 x i64>* %0 to x86_mmx*
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
index e6a7232adbf..5ce632d0aa1 100644
--- a/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -113,20 +113,16 @@ define i32 @test3(x86_mmx %a) nounwind {
 define i32 @test4(x86_mmx %a) nounwind {
 ; X32-LABEL: test4:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-16, %esp
-; X32-NEXT:    subl $32, %esp
-; X32-NEXT:    movq %mm0, (%esp)
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
+; X32-NEXT:    movq2dq %mm0, %xmm0
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT:    movd %xmm0, %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test4:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT:    movd %xmm0, %eax
 ; X64-NEXT:    retq
   %tmp0 = bitcast x86_mmx %a to <2 x i32>
   %tmp1 = extractelement <2 x i32> %tmp0, i32 1
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 2c719e61c57..52d6e7ca7a2 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -8,15 +8,12 @@
 define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X32-LABEL: mmx_movzl:
 ; X32:       ## %bb.0:
-; X32-NEXT:    subl $28, %esp
-; X32-NEXT:    movq %mm0, (%esp)
-; X32-NEXT:    movdqa (%esp), %xmm0
+; X32-NEXT:    movq2dq %mm0, %xmm0
 ; X32-NEXT:    movl $32, %eax
 ; X32-NEXT:    pinsrd $0, %eax, %xmm0
 ; X32-NEXT:    pxor %xmm1, %xmm1
 ; X32-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; X32-NEXT:    movdq2q %xmm1, %mm0
-; X32-NEXT:    addl $28, %esp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: mmx_movzl:
-- 
2.40.0