From abb4a55f1361e246226ee3f8afd01e357c82343f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Sat, 12 Aug 2017 17:43:25 +0000
Subject: [PATCH] [DAGCombiner] Extending pattern detection for vector shuffle
 (REAPPLIED)

If all the operands of a BUILD_VECTOR extract elements from the same
vector, then split that vector efficiently based on the maximum vector
access index.

Reapplied with a fix to only work with simple value types.

Committed on behalf of @jbhateja (Jatin Bhateja)

Differential Revision: https://reviews.llvm.org/D35788

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310782 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp   | 49 +++++++++++-
 test/CodeGen/X86/shuffle-vs-trunc-512.ll   | 92 ++++++----------------
 test/CodeGen/X86/vector-shuffle-512-v16.ll | 11 +--
 test/CodeGen/X86/vector-shuffle-512-v8.ll  | 13 ++-
 test/CodeGen/X86/x86-interleaved-access.ll | 62 +++++++--------
 5 files changed, 109 insertions(+), 118 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1522022fe75..4cdac46ee6b 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14186,10 +14186,18 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
   EVT InVT1 = VecIn1.getValueType();
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
-  unsigned Vec2Offset = InVT1.getVectorNumElements();
+  unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
 
+  // In case both input vectors are extracted from the same base
+  // vector, we do not need the extra addend (Vec2Offset) while
+  // computing the shuffle mask.
+  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
+      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
+    Vec2Offset = InVT1.getVectorNumElements();
+
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
   if (InVT1 != VT || InVT2 != VT) {
@@ -14336,7 +14344,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
         !isa<ConstantSDNode>(Op.getOperand(1)))
       return SDValue();
-
     SDValue ExtractedFromVec = Op.getOperand(0);
 
     // All inputs must have the same element type as the output.
@@ -14359,6 +14366,44 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   if (VecIn.size() < 2)
     return SDValue();
 
+  // If all the operands of BUILD_VECTOR extract from the same
+  // vector, then split the vector efficiently based on the maximum
+  // vector access index and adjust the VectorMask and
+  // VecIn accordingly.
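+  // e.g. A v4i32 BUILD_VECTOR reading lanes 0, 1, 2 and 12 of a v16i32
+  // source has MaxIndex = 12, so NearestPow2 = 16 and SplitSize = 8:
+  // the source is split into two extracted v8i32 halves, with lanes
+  // 0-2 selecting from the first half (VectorMask = 1) and lane 12
+  // from the second (VectorMask = 2).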
+  if (VecIn.size() == 2) {
+    unsigned MaxIndex = 0;
+    unsigned NearestPow2 = 0;
+    SDValue Vec = VecIn.back();
+    EVT InVT = Vec.getValueType();
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SmallVector<unsigned, 8> IndexVec(NumElems, 0);
+
+    for (unsigned i = 0; i < NumElems; i++) {
+      if (VectorMask[i] <= 0)
+        continue;
+      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
+      IndexVec[i] = Index;
+      MaxIndex = std::max(MaxIndex, Index);
+    }
+
+    NearestPow2 = PowerOf2Ceil(MaxIndex);
+    if (InVT.isSimple() && (NearestPow2 > 2) &&
+        ((NumElems * 2) < NearestPow2)) {
+      unsigned SplitSize = NearestPow2 / 2;
+      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
+                                     InVT.getVectorElementType(), SplitSize);
+      SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                   DAG.getConstant(SplitSize, DL, IdxTy));
+      SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
+                                   DAG.getConstant(0, DL, IdxTy));
+      VecIn.pop_back();
+      VecIn.push_back(VecIn1);
+      VecIn.push_back(VecIn2);
+
+      for (unsigned i = 0; i < NumElems; i++)
+        VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
+    }
+  }
+
   // TODO: We want to sort the vectors by descending length, so that adjacent
   // pairs have similar length, and the longer vector is always first in the
   // pair.
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 86e3fc110d3..05947829edb 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -261,81 +261,33 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX512BW-NEXT:    vmovd %ecx, %xmm1
-; AVX512BW-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BWVL:       # BB#0:
-; AVX512BWVL-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpextrb $1, %xmm0, %ecx
-; AVX512BWVL-NEXT:    vmovd %ecx, %xmm1
-; AVX512BWVL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BWVL-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BWVL-NEXT:    vpextrb $1, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $5, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $9, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $13, %xmm2, %eax
-; AVX512BWVL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BWVL-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 4637a3f8860..0f00f94f020 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -286,13 +286,10 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
 define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 ; ALL-LABEL: test_v16i32_0_1_2_12:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpextrd $1, %xmm0, %eax
-; ALL-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm1
-; ALL-NEXT:    vpextrd $2, %xmm0, %eax
-; ALL-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; ALL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; ALL-NEXT:    vextracti32x8 $1, %zmm0, %ymm1
+; ALL-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; ALL-NEXT:    vpbroadcastd %xmm1, %xmm1
+; ALL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index d20d8669432..f8268cb6bc1 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2726,20 +2726,17 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
 define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
 ; AVX512F-LABEL: test_v8i64_2_5:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_v8i64_2_5:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
-; AVX512F-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX512F-32-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX512F-32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-32-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6a3fe0c93ab..b2760f9ad82 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -567,37 +567,37 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT:    vpcmpeqb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm4, %xmm3, %xmm6
-; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm7
-; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm5
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; AVX2-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm5
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm5
+; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
 ; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm4, %xmm0
+; AVX2-NEXT:    vpcmpeqb %xmm0, %xmm3, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
 ; AVX2-NEXT:    vpand %xmm1, %xmm2, %xmm2
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
@@ -836,15 +836,15 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm3
 ; AVX512-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512-NEXT:    vpmovwb %zmm2, %ymm8
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm14
 ; AVX512-NEXT:    vextracti128 $1, %ymm14, %xmm9
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm9, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm7, %xmm14, %xmm5
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm10
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm10, %xmm6
 ; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm4
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-- 
2.50.1
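
For illustration, a minimal standalone sketch of the split heuristic the
DAGCombiner hunk above adds (not part of the patch): PowerOf2Ceil is a local
stand-in for llvm::PowerOf2Ceil, and the sample lane indices {0, 1, 2, 12}
are taken from the test_v16i32_0_1_2_12 test updated by this patch.

    // Given the lanes a BUILD_VECTOR reads from one source vector, decide
    // whether to split the source and which half each lane maps to.
    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Local stand-in for llvm::PowerOf2Ceil: smallest power of two >= A.
    static uint64_t PowerOf2Ceil(uint64_t A) {
      uint64_t P = 1;
      while (P < A)
        P <<= 1;
      return A ? P : 0;
    }

    int main() {
      // Lane indices read from a v16i32 source (as in test_v16i32_0_1_2_12).
      std::vector<unsigned> IndexVec = {0, 1, 2, 12};
      unsigned NumElems = IndexVec.size();
      unsigned MaxIndex = *std::max_element(IndexVec.begin(), IndexVec.end());

      uint64_t NearestPow2 = PowerOf2Ceil(MaxIndex); // 16
      if (NearestPow2 > 2 && (uint64_t)NumElems * 2 < NearestPow2) {
        unsigned SplitSize = NearestPow2 / 2; // 8: split v16i32 -> 2 x v8i32
        for (unsigned Index : IndexVec)
          std::cout << "lane " << Index << " -> subvector "
                    << (Index < SplitSize ? 1 : 2) << '\n';
      }
      return 0;
    }

The split fires because MaxIndex = 12 rounds up to 16 and NumElems * 2 = 8 is
below it, so two v8i32 EXTRACT_SUBVECTORs replace the single wide source and
lanes 0-2 map to the first subvector while lane 12 maps to the second.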