From 4396102a68503715f3cf30b586fac8a5ec92ae98 Mon Sep 17 00:00:00 2001
From: Justin Bogner <mail@justinbogner.com>
Date: Tue, 19 Mar 2019 16:52:00 +0000
Subject: [PATCH] [DAGCombine] Fix a miscompile when reducing BUILD_VECTORs to
 a shuffle

In r311255 we added a case where we split vectors whose elements are
all derived from the same input vector so that we could shuffle it
more efficiently. In doing so, createBuildVecShuffle was taught to
adjust for the fact that all indices would be based off of the first
vector when this happens, but it's possible for the code that checked
that to fire incorrectly if we happen to have a BUILD_VECTOR of
extracts from subvectors and don't hit this new optimization.

Instead of trying to detect if we've split the vector by checking if
we have extracts from the same base vector, we can just pass that
information into createBuildVecShuffle, avoiding the miscompile.

Differential Revision: https://reviews.llvm.org/D59507

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356476 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      | 20 ++++----
 test/CodeGen/X86/shuffle-extract-subvector.ll | 48 +++++++++++++++++++
 2 files changed, 57 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/X86/shuffle-extract-subvector.ll
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cb19c0a4e0f..4d85532a81b 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -475,7 +475,8 @@ namespace {
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
-                                  SDValue VecIn2, unsigned LeftIdx);
+                                  SDValue VecIn2, unsigned LeftIdx,
+                                  bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
 
     /// Walk up chain skipping non-aliasing memory nodes,
@@ -16416,7 +16417,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
-                                           unsigned LeftIdx) {
+                                           unsigned LeftIdx, bool DidSplitVec) {
   MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
   SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
 
@@ -16424,17 +16425,12 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
-  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
 
-  // In case both the input vectors are extracted from same base
-  // vector we do not need extra addend (Vec2Offset) while
-  // computing shuffle mask.
-  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
-      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
-    Vec2Offset = InVT1.getVectorNumElements();
+  // If we artificially split a vector in two already, then the offsets in the
+  // operands will all be based off of VecIn1, even those in VecIn2.
+  unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
@@ -16693,6 +16689,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   // vector, then split the vector efficiently based on the maximum
   // vector access index and adjust the VectorMask and
   // VecIn accordingly.
+  bool DidSplitVec = false;
   if (VecIn.size() == 2) {
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
@@ -16723,6 +16720,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         VecIn.pop_back();
         VecIn.push_back(VecIn1);
         VecIn.push_back(VecIn2);
+        DidSplitVec = true;
 
         for (unsigned i = 0; i < NumElems; i++) {
           if (VectorMask[i] <= 0)
@@ -16760,7 +16758,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
         (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
 
     if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
-                                                VecRight, LeftIdx))
+                                                VecRight, LeftIdx, DidSplitVec))
       Shuffles.push_back(Shuffle);
     else
       return SDValue();
diff --git a/test/CodeGen/X86/shuffle-extract-subvector.ll b/test/CodeGen/X86/shuffle-extract-subvector.ll
new file mode 100644
index 00000000000..02ec475b05d
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movzwl (%rdi), %r8d
+; CHECK-NEXT:    movzwl 2(%rdi), %r9d
+; CHECK-NEXT:    movzwl 4(%rdi), %r11d
+; CHECK-NEXT:    movzwl 6(%rdi), %edi
+; CHECK-NEXT:    movzwl (%rsi), %r10d
+; CHECK-NEXT:    movzwl 2(%rsi), %ecx
+; CHECK-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-NEXT:    movzwl 6(%rsi), %esi
+; CHECK-NEXT:    movw %si, 14(%rdx)
+; CHECK-NEXT:    movw %di, 12(%rdx)
+; CHECK-NEXT:    movw %ax, 10(%rdx)
+; CHECK-NEXT:    movw %r11w, 8(%rdx)
+; CHECK-NEXT:    movw %cx, 6(%rdx)
+; CHECK-NEXT:    movw %r9w, 4(%rdx)
+; CHECK-NEXT:    movw %r10w, 2(%rdx)
+; CHECK-NEXT:    movw %r8w, (%rdx)
+; CHECK-NEXT:    retq
+  %tmp4 = load <4 x half>, <4 x half>* %a
+  %tmp5 = load <4 x half>, <4 x half>* %b
+  %tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 0, i32 4>
+  %tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 1, i32 5>
+  %tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 2, i32 6>
+  %tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 3, i32 7>
+  %tmp11 = extractelement <2 x half> %tmp7, i32 0
+  %tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0
+  %tmp13 = extractelement <2 x half> %tmp7, i32 1
+  %tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1
+  %tmp15 = extractelement <2 x half> %tmp8, i32 0
+  %tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2
+  %tmp17 = extractelement <2 x half> %tmp8, i32 1
+  %tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3
+  %tmp19 = extractelement <2 x half> %tmp9, i32 0
+  %tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4
+  %tmp21 = extractelement <2 x half> %tmp9, i32 1
+  %tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5
+  %tmp23 = extractelement <2 x half> %tmp10, i32 0
+  %tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6
+  %tmp25 = extractelement <2 x half> %tmp10, i32 1
+  %tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7
+  store <8 x half> %tmp26, <8 x half>* %c
+  ret void
+}
-- 
2.40.0