From 1f42623bce9db9e4ca7989a5eac231c9e9d0a0ef Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Sun, 17 Sep 2017 22:36:41 +0000
Subject: [PATCH] [X86] Teach shuffle lowering to use MOVLHPS/MOVHLPS for
 lowering v4f32 unary shuffles with SSE1 only.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313504 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp      | 22 +++++++++++++++++-----
 test/CodeGen/X86/vector-shuffle-sse1.ll |  4 ++--
 2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 4df7621bee2..d327c98be69 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10725,6 +10725,15 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
     }
 
+    // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These only matter for
+    // SSE1-only targets; with SSE2 the shuffle is widened to v2f64 instead.
+    if (!Subtarget.hasSSE2()) {
+      if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
+        return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
+      if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
+        return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
+    }
+
     // Otherwise, use a straight shuffle of a single input vector. We pass the
     // input vector to both operands to simulate this with a SHUFPS.
     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
@@ -10757,11 +10766,14 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
         return BlendPerm;
   }
 
-  // Use low/high mov instructions.
-  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
-    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
-  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
-    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+  // Use low/high mov instructions. These only matter for SSE1-only targets;
+  // with SSE2 the shuffle is widened to v2f64 and never gets here.
+  if (!Subtarget.hasSSE2()) {
+    if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+      return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+    if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+      return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+  }
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index 22b5102c06f..cf8e8eb8a12 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -112,7 +112,7 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_0101(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_0101:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE1-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   ret <4 x float> %shuffle
@@ -121,7 +121,7 @@ define <4 x float> @shuffle_v4f32_0101(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @shuffle_v4f32_2323(<4 x float> %a, <4 x float> %b) {
 ; SSE1-LABEL: shuffle_v4f32_2323:
 ; SSE1:       # BB#0:
-; SSE1-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE1-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE1-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
   ret <4 x float> %shuffle
-- 
2.50.0