From 4396102a68503715f3cf30b586fac8a5ec92ae98 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Tue, 19 Mar 2019 16:52:00 +0000 Subject: [PATCH] [DAGCombine] Fix a miscompile when reducing BUILD_VECTORs to a shuffle In r311255 we added a case where we split vectors whose elements are all derived from the same input vector so that we could shuffle it more efficiently. In doing so, createBuildVecShuffle was taught to adjust for the fact that all indices would be based off of the first vector when this happens, but it's possible for the code that checked that to fire incorrectly if we happen to have a BUILD_VECTOR of extracts from subvectors and don't hit this new optimization. Instead of trying to detect if we've split the vector by checking if we have extracts from the same base vector, we can just pass that information into createBuildVecShuffle, avoiding the miscompile. Differential Revision: https://reviews.llvm.org/D59507 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@356476 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 ++++---- test/CodeGen/X86/shuffle-extract-subvector.ll | 48 +++++++++++++++++++ 2 files changed, 57 insertions(+), 11 deletions(-) create mode 100644 test/CodeGen/X86/shuffle-extract-subvector.ll diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cb19c0a4e0f..4d85532a81b 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -475,7 +475,8 @@ namespace { SDValue reduceBuildVecToShuffle(SDNode *N); SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, - SDValue VecIn2, unsigned LeftIdx); + SDValue VecIn2, unsigned LeftIdx, + bool DidSplitVec); SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); /// Walk up chain skipping non-aliasing memory nodes, @@ -16416,7 +16417,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, SDValue VecIn2, - unsigned LeftIdx) { + unsigned LeftIdx, bool DidSplitVec) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy); @@ -16424,17 +16425,12 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, EVT InVT1 = VecIn1.getValueType(); EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1; - unsigned Vec2Offset = 0; unsigned NumElems = VT.getVectorNumElements(); unsigned ShuffleNumElems = NumElems; - // In case both the input vectors are extracted from same base - // vector we do not need extra addend (Vec2Offset) while - // computing shuffle mask. - if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) || - !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) || - !(VecIn1.getOperand(0) == VecIn2.getOperand(0))) - Vec2Offset = InVT1.getVectorNumElements(); + // If we artificially split a vector in two already, then the offsets in the + // operands will all be based off of VecIn1, even those in VecIn2. + unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements(); // We can't generate a shuffle node with mismatched input and output types. // Try to make the types match the type of the output. @@ -16693,6 +16689,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // vector, then split the vector efficiently based on the maximum // vector access index and adjust the VectorMask and // VecIn accordingly. + bool DidSplitVec = false; if (VecIn.size() == 2) { unsigned MaxIndex = 0; unsigned NearestPow2 = 0; @@ -16723,6 +16720,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { VecIn.pop_back(); VecIn.push_back(VecIn1); VecIn.push_back(VecIn2); + DidSplitVec = true; for (unsigned i = 0; i < NumElems; i++) { if (VectorMask[i] <= 0) @@ -16760,7 +16758,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue(); if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft, - VecRight, LeftIdx)) + VecRight, LeftIdx, DidSplitVec)) Shuffles.push_back(Shuffle); else return SDValue(); diff --git a/test/CodeGen/X86/shuffle-extract-subvector.ll b/test/CodeGen/X86/shuffle-extract-subvector.ll new file mode 100644 index 00000000000..02ec475b05d --- /dev/null +++ b/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { +; CHECK-LABEL: f: +; CHECK: # %bb.0: +; CHECK-NEXT: movzwl (%rdi), %r8d +; CHECK-NEXT: movzwl 2(%rdi), %r9d +; CHECK-NEXT: movzwl 4(%rdi), %r11d +; CHECK-NEXT: movzwl 6(%rdi), %edi +; CHECK-NEXT: movzwl (%rsi), %r10d +; CHECK-NEXT: movzwl 2(%rsi), %ecx +; CHECK-NEXT: movzwl 4(%rsi), %eax +; CHECK-NEXT: movzwl 6(%rsi), %esi +; CHECK-NEXT: movw %si, 14(%rdx) +; CHECK-NEXT: movw %di, 12(%rdx) +; CHECK-NEXT: movw %ax, 10(%rdx) +; CHECK-NEXT: movw %r11w, 8(%rdx) +; CHECK-NEXT: movw %cx, 6(%rdx) +; CHECK-NEXT: movw %r9w, 4(%rdx) +; CHECK-NEXT: movw %r10w, 2(%rdx) +; CHECK-NEXT: movw %r8w, (%rdx) +; CHECK-NEXT: retq + %tmp4 = load <4 x half>, <4 x half>* %a + %tmp5 = load <4 x half>, <4 x half>* %b + %tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> + %tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> + %tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> + %tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> + %tmp11 = extractelement <2 x half> %tmp7, i32 0 + %tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0 + %tmp13 = extractelement <2 x half> %tmp7, i32 1 + %tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1 + %tmp15 = extractelement <2 x half> %tmp8, i32 0 + %tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2 + %tmp17 = extractelement <2 x half> %tmp8, i32 1 + %tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3 + %tmp19 = extractelement <2 x half> %tmp9, i32 0 + %tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4 + %tmp21 = extractelement <2 x half> %tmp9, i32 1 + %tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5 + %tmp23 = extractelement <2 x half> %tmp10, i32 0 + %tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6 + %tmp25 = extractelement <2 x half> %tmp10, i32 1 + %tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7 + store <8 x half> %tmp26, <8 x half>* %c + ret void +} -- 2.40.0