From 46079999e9111e8e8c2c1f0511477c3bec38d74a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 29 Sep 2016 19:16:52 +0000
Subject: [PATCH] [3.9.1] Merging r280837 [X86] Don't reduce the width of
 vector mul if the target doesn't support SSE2.

The patch is to fix PR30298, which is caused by rL272694. The solution is to
bail out if the target has no SSE2.

Differential Revision: https://reviews.llvm.org/D24288

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@282753 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp |  3 ++-
 test/CodeGen/X86/pr30298.ll        | 43 ++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/X86/pr30298.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2c548384f1c..75c58525b77 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27516,7 +27516,8 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
   // pmulld is supported since SSE41. It is better to use pmulld
   // instead of pmullw+pmulhw.
-  if (Subtarget.hasSSE41())
+  // pmullw/pmulhw are not supported by SSE.
+  if (Subtarget.hasSSE41() || !Subtarget.hasSSE2())
     return SDValue();
 
   ShrinkMode Mode;
diff --git a/test/CodeGen/X86/pr30298.ll b/test/CodeGen/X86/pr30298.ll
new file mode 100644
index 00000000000..1e6dad0b20d
--- /dev/null
+++ b/test/CodeGen/X86/pr30298.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-pc-linux-gnu -mattr=+sse < %s | FileCheck %s
+
+@c = external global i32*, align 8
+
+define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
+; CHECK-LABEL: mul_2xi8:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl c, %esi
+; CHECK-NEXT:    movzbl 1(%edx,%ecx), %edi
+; CHECK-NEXT:    movzbl (%edx,%ecx), %edx
+; CHECK-NEXT:    movzbl 1(%eax,%ecx), %ebx
+; CHECK-NEXT:    movzbl (%eax,%ecx), %eax
+; CHECK-NEXT:    imull %edx, %eax
+; CHECK-NEXT:    imull %edi, %ebx
+; CHECK-NEXT:    movl %ebx, 4(%esi,%ecx,4)
+; CHECK-NEXT:    movl %eax, (%esi,%ecx,4)
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    retl
+entry:
+  %pre = load i32*, i32** @c
+  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
+  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
+  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
+  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
+  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
+  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
+  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
+  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
+  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
+  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
+  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
+  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
+  ret void
+}
-- 
2.40.0