From 7e1d1537d39eaeccdc86843fc8d0e51ab82dcec9 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Mon, 11 Dec 2017 15:19:31 +0000
Subject: [PATCH] [DAGCombiner] protect against an infinite loop between shl
 <--> mul (PR35579)

At first, I tried to thread the x86 needle and use a target hook (isVectorShiftByScalarCheap())
to disable the transform only for non-splat pow-of-2 constants, but not AVX2, but only some
element types, but...it's difficult.

Here we just avoid the loop with the x86 vector transform that conflicts with the general DAG
combine and preserve all of the existing behavior AFAICT otherwise.

Some tests that will probably fail if someone does try to restrict this in a more targeted way
for x86-only may be found in:

test/CodeGen/X86/combine-mul.ll
test/CodeGen/X86/vector-mul.ll
test/CodeGen/X86/widen_arith-5.ll

This should prevent the infinite looping seen with:
https://bugs.llvm.org/show_bug.cgi?id=35579

Differential Revision: https://reviews.llvm.org/D41040


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320374 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  3 ++-
 test/CodeGen/X86/combine-mul.ll          | 34 ++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d8ac4a52946..da2ca8851e3 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2675,7 +2675,8 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   }
   // fold (mul x, (1 << c)) -> x << c
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
-      DAG.isKnownToBeAPowerOfTwo(N1)) {
+      DAG.isKnownToBeAPowerOfTwo(N1) &&
+      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
     SDLoc DL(N);
     SDValue LogBase2 = BuildLogBase2(N1, DL);
     AddToWorklist(LogBase2.getNode());
diff --git a/test/CodeGen/X86/combine-mul.ll b/test/CodeGen/X86/combine-mul.ll
index 21733acac1f..f021788e245 100644
--- a/test/CodeGen/X86/combine-mul.ll
+++ b/test/CodeGen/X86/combine-mul.ll
@@ -308,3 +308,37 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
   %2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
   ret <4 x i32> %2
 }
+
+; This would infinite loop because DAGCombiner wants to turn this into a shift,
+; but x86 lowering wants to avoid non-uniform vector shift amounts.
+
+define <16 x i8> @PR35579(<16 x i8> %x) {
+; SSE-LABEL: PR35579:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pmovsxbw %xmm0, %xmm1
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmovsxbw %xmm0, %xmm0
+; SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: PR35579:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
+  ret <16 x i8> %r
+}
+
-- 
2.50.1