From 727659d0c80ece9ead74d03f8430236b1b7f482d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 9 Jun 2017 17:29:52 +0000
Subject: [PATCH] [X86][SSE] Add support for PACKSS nodes to faux shuffle extraction

If the inputs won't saturate during packing then we can treat the PACKSS as a truncation shuffle

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305091 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         |  38 +-
 test/CodeGen/X86/vector-compare-results.ll | 538 ++++++++++-----------
 2 files changed, 297 insertions(+), 279 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 759ddf58deb..831e9bdab0e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1,4 +1,4 @@
- 
+
 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -5816,7 +5816,8 @@ static bool setTargetShuffleZeroElements(SDValue N,
 // The decoded shuffle mask may contain a different number of elements to the
 // destination value type.
 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
-                               SmallVectorImpl<SDValue> &Ops) {
+                               SmallVectorImpl<SDValue> &Ops,
+                               SelectionDAG &DAG) {
   Mask.clear();
   Ops.clear();
 
@@ -5924,6 +5925,19 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
       Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
     return true;
   }
+  case X86ISD::PACKSS: {
+    // If we know input saturation won't happen we can treat this
+    // as a truncation shuffle.
+    if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
+        DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
+      return false;
+
+    Ops.push_back(N.getOperand(0));
+    Ops.push_back(N.getOperand(1));
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(i * 2);
+    return true;
+  }
   case X86ISD::VSHLI:
   case X86ISD::VSRLI: {
     uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -5998,9 +6012,10 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
 /// Returns true if the target shuffle mask was decoded.
 static bool resolveTargetShuffleInputs(SDValue Op,
                                        SmallVectorImpl<SDValue> &Inputs,
-                                       SmallVectorImpl<int> &Mask) {
+                                       SmallVectorImpl<int> &Mask,
+                                       SelectionDAG &DAG) {
   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
-    if (!getFauxShuffleMask(Op, Mask, Inputs))
+    if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
       return false;
 
   resolveTargetShuffleInputsAndMask(Inputs, Mask);
@@ -26760,6 +26775,17 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
     return Tmp;
   }
 
+  case X86ISD::VSHLI: {
+    SDValue Src = Op.getOperand(0);
+    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+    if (ShiftVal.uge(VTBits))
+      return VTBits; // Shifted all bits out --> zero.
+    if (ShiftVal.uge(Tmp))
+      return 1; // Shifted all sign bits out --> unknown.
+    return Tmp - ShiftVal.getZExtValue();
+  }
+
   case X86ISD::VSRAI: {
     SDValue Src = Op.getOperand(0);
     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
@@ -27908,7 +27934,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
   // Extract target shuffle mask and resolve sentinels and inputs.
   SmallVector<int, 64> OpMask;
   SmallVector<SDValue, 2> OpInputs;
-  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
+  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
     return false;
 
   assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
@@ -29450,7 +29476,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
   // Resolve the target shuffle inputs and mask.
SmallVector Mask; SmallVector Ops; - if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask)) + if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG)) return SDValue(); // Attempt to narrow/widen the shuffle mask to the correct size. diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 4fa9596192a..ce0b067f504 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -5345,217 +5345,213 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; ; AVX1-LABEL: test_cmp_v64i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8 -; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm8 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtw %xmm5, %xmm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtw %xmm6, %xmm2, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpgtw %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpcmpgtw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtw %xmm7, %xmm3, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vpcmpgtw %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpacksswb %xmm0, %xmm3, %xmm3 -; AVX1-NEXT: vpextrb $15, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $14, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $13, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $12, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $11, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $10, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $9, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $8, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $7, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $6, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $5, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $4, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $3, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $2, %xmm3, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb 
%al, 4(%rdi) -; AVX1-NEXT: vpextrb $1, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $0, %xmm3, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $15, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $14, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $14, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $12, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $13, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $10, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $12, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $8, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $11, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $6, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $10, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $4, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $9, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $2, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $8, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $0, %xmm6, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $7, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $14, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $6, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $12, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $5, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $10, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $4, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $8, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $3, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $6, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $2, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $4, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $1, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $2, %xmm2, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) ; AVX1-NEXT: vpextrb $0, %xmm2, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $14, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $12, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $10, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $8, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $6, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $4, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $2, %xmm5, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, 4(%rdi) +; AVX1-NEXT: vpextrb $0, %xmm5, %eax 
+; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, 4(%rdi) -; AVX1-NEXT: vpextrb $15, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $14, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $13, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $12, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $11, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $10, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $9, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $8, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $7, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $6, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $5, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $4, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $3, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $2, %xmm1, %eax -; AVX1-NEXT: andb $1, %al -; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $1, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $0, %xmm1, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $15, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $14, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $14, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $12, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $13, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $10, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $12, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $8, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $11, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $6, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $10, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $4, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $9, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $2, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $8, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $0, %xmm4, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $7, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $14, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $6, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $12, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $5, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $10, %xmm0, %eax +; AVX1-NEXT: andl 
$1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $4, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $8, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $3, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $6, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $2, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $4, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) -; AVX1-NEXT: vpextrb $1, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: vpextrb $2, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $0, %xmm0, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $14, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $12, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $10, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $8, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $6, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $4, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax +; AVX1-NEXT: movb %al, (%rdi) +; AVX1-NEXT: vpextrb $2, %xmm8, %eax +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: vpextrb $0, %xmm8, %eax -; AVX1-NEXT: andb $1, %al +; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: movb %al, (%rdi) ; AVX1-NEXT: movq %rdi, %rax ; AVX1-NEXT: vzeroupper @@ -5565,207 +5561,203 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind { ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX2-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpacksswb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpextrb $15, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7 +; AVX2-NEXT: vpextrb $14, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $14, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $13, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $12, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $11, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $10, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $9, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; 
AVX2-NEXT: vpextrb $8, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $0, %xmm7, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $7, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $6, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $5, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $4, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $3, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $2, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $1, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm3, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) ; AVX2-NEXT: vpextrb $0, %xmm3, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $15, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $14, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $13, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $12, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $11, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $10, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $9, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $8, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $0, %xmm6, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $7, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $6, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $5, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $4, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $3, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: 
vpextrb $2, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $1, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm2, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) ; AVX2-NEXT: vpextrb $0, %xmm2, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, 4(%rdi) -; AVX2-NEXT: vpextrb $15, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $14, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $13, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $12, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $11, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $10, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $9, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $8, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $0, %xmm5, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $7, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $6, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $5, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $4, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $3, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $2, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $1, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm1, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) ; AVX2-NEXT: vpextrb $0, %xmm1, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $15, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $14, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $13, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $12, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: 
vpextrb $8, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $11, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $10, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $9, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $8, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $0, %xmm4, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $7, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $14, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $6, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $12, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $5, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $10, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $4, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $8, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $3, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $6, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $2, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $4, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) -; AVX2-NEXT: vpextrb $1, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: vpextrb $2, %xmm0, %eax +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) ; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: andb $1, %al +; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: movb %al, (%rdi) ; AVX2-NEXT: movq %rdi, %rax ; AVX2-NEXT: vzeroupper -- 2.40.0
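
As an illustrative aside (not part of the patch itself): the PACKSS hunk above rests on a simple scalar fact — a signed-saturating pack behaves exactly like a plain truncation whenever every source element already fits in the narrower type, i.e. whenever ComputeNumSignBits reports strictly more sign bits than the destination element width (NumBitsPerElt). The minimal standalone C++ sketch below models a single PACKSSWB lane (i16 -> i8) to show that equivalence; packss_lane and trunc_lane are hypothetical helper names used only for this illustration.

// Illustration only: scalar model of one PACKSSWB lane (i16 -> i8).
#include <cassert>
#include <cstdint>
#include <cstdio>

// Signed-saturating pack of a single i16 lane to i8, as PACKSSWB does.
static int8_t packss_lane(int16_t x) {
  if (x > INT8_MAX)
    return INT8_MAX;
  if (x < INT8_MIN)
    return INT8_MIN;
  return static_cast<int8_t>(x);
}

// Plain truncation of the same lane: keep only the low byte.
static int8_t trunc_lane(int16_t x) {
  return static_cast<int8_t>(static_cast<uint8_t>(x));
}

int main() {
  // An i16 value with more than 8 sign bits (ComputeNumSignBits > NumBitsPerElt
  // in the patch's terms) already fits in i8, so saturation cannot fire and the
  // pack degenerates to a truncation.
  for (int v = INT8_MIN; v <= INT8_MAX; ++v)
    assert(packss_lane(static_cast<int16_t>(v)) ==
           trunc_lane(static_cast<int16_t>(v)));

  // Outside that range the two differ, which is why getFauxShuffleMask bails
  // out (returns false) when it cannot prove enough sign bits on both inputs.
  assert(packss_lane(300) != trunc_lane(300));

  std::puts("PACKSS == truncation whenever the inputs cannot saturate");
  return 0;
}

Under that guarantee the node can be rewritten as the truncating shuffle built in the new case (both operands pushed as inputs, mask entries i * 2 selecting the low half of each wide element), and the VSHLI handling added to ComputeNumSignBitsForTargetNode lets such sign-bit proofs succeed in more places.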