SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
- SDValue VecIn2, unsigned LeftIdx);
+ SDValue VecIn2, unsigned LeftIdx,
+ bool DidSplitVec);
SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
/// Walk up chain skipping non-aliasing memory nodes,
SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
- unsigned LeftIdx) {
+ unsigned LeftIdx, bool DidSplitVec) {
MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
EVT InVT1 = VecIn1.getValueType();
EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
- unsigned Vec2Offset = 0;
unsigned NumElems = VT.getVectorNumElements();
unsigned ShuffleNumElems = NumElems;
- // In case both the input vectors are extracted from same base
- // vector we do not need extra addend (Vec2Offset) while
- // computing shuffle mask.
- if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
- !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
- !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
- Vec2Offset = InVT1.getVectorNumElements();
+ // If we artificially split a vector in two already, then the offsets in the
+ // operands will all be based off of VecIn1, even those in VecIn2.
+ unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
// We can't generate a shuffle node with mismatched input and output types.
// Try to make the types match the type of the output.
// vector, then split the vector efficiently based on the maximum
// vector access index and adjust the VectorMask and
// VecIn accordingly.
+ bool DidSplitVec = false;
if (VecIn.size() == 2) {
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
VecIn.pop_back();
VecIn.push_back(VecIn1);
VecIn.push_back(VecIn2);
+ DidSplitVec = true;
for (unsigned i = 0; i < NumElems; i++) {
if (VectorMask[i] <= 0)
(LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
- VecRight, LeftIdx))
+ VecRight, LeftIdx, DidSplitVec))
Shuffles.push_back(Shuffle);
else
return SDValue();
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Interleaves two <4 x half> vectors loaded from %a and %b into a single
+; <8 x half> value (a0, b0, a1, b1, a2, b2, a3, b3) and stores it to %c.
+; The interleave is expressed as four two-element shufflevectors whose lanes
+; are then extracted one at a time and inserted into the result, so the
+; backend sees a build-vector of elements extracted from shuffled inputs.
+; The expected output is eight 16-bit loads followed by eight 16-bit stores
+; in the interleaved order.
+; NOTE(review): presumably a regression test for the build-vector-to-shuffle
+; DAG combine -- confirm against the accompanying commit message.
+define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
+; CHECK-LABEL: f:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl (%rdi), %r8d
+; CHECK-NEXT: movzwl 2(%rdi), %r9d
+; CHECK-NEXT: movzwl 4(%rdi), %r11d
+; CHECK-NEXT: movzwl 6(%rdi), %edi
+; CHECK-NEXT: movzwl (%rsi), %r10d
+; CHECK-NEXT: movzwl 2(%rsi), %ecx
+; CHECK-NEXT: movzwl 4(%rsi), %eax
+; CHECK-NEXT: movzwl 6(%rsi), %esi
+; CHECK-NEXT: movw %si, 14(%rdx)
+; CHECK-NEXT: movw %di, 12(%rdx)
+; CHECK-NEXT: movw %ax, 10(%rdx)
+; CHECK-NEXT: movw %r11w, 8(%rdx)
+; CHECK-NEXT: movw %cx, 6(%rdx)
+; CHECK-NEXT: movw %r9w, 4(%rdx)
+; CHECK-NEXT: movw %r10w, 2(%rdx)
+; CHECK-NEXT: movw %r8w, (%rdx)
+; CHECK-NEXT: retq
+ %tmp4 = load <4 x half>, <4 x half>* %a
+ %tmp5 = load <4 x half>, <4 x half>* %b
+ ; Mask <i, i+4> selects lane i of %tmp4 followed by lane i of %tmp5.
+ %tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 0, i32 4>
+ %tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 1, i32 5>
+ %tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 2, i32 6>
+ %tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 3, i32 7>
+ ; Rebuild the 8-element result one lane at a time: both lanes of each
+ ; two-element shuffle go into consecutive result positions.
+ %tmp11 = extractelement <2 x half> %tmp7, i32 0
+ %tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0
+ %tmp13 = extractelement <2 x half> %tmp7, i32 1
+ %tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1
+ %tmp15 = extractelement <2 x half> %tmp8, i32 0
+ %tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2
+ %tmp17 = extractelement <2 x half> %tmp8, i32 1
+ %tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3
+ %tmp19 = extractelement <2 x half> %tmp9, i32 0
+ %tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4
+ %tmp21 = extractelement <2 x half> %tmp9, i32 1
+ %tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5
+ %tmp23 = extractelement <2 x half> %tmp10, i32 0
+ %tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6
+ %tmp25 = extractelement <2 x half> %tmp10, i32 1
+ %tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7
+ store <8 x half> %tmp26, <8 x half>* %c
+ ret void
+}