From 7dde04014c0d060be90b1c5f106fcae2274c4403 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Jul 2019 14:33:25 +0000 Subject: [PATCH] [X86] EltsFromConsecutiveLoads - support common source loads This patch enables us to find the source loads for each element, splitting them into a Load and ByteOffset, and attempts to recognise consecutive loads that are in fact from the same source load. A helper function, findEltLoadSrc, recurses to find a LoadSDNode and determines the element's byte offset within it. When attempting to match consecutive loads, byte-offset loads are then matched against a previous load that has already been confirmed to be a consecutive match. Next step towards PR16739 - after this we just need to account for shuffling/repeated elements to create a vector load + shuffle. Differential Revision: https://reviews.llvm.org/D64551 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366441 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 68 +++- .../X86/clear_upper_vector_element_bits.ll | 300 +++++++----------- test/CodeGen/X86/load-partial.ll | 60 +--- 3 files changed, 179 insertions(+), 249 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 0b4bf687e6c..cf624817b87 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7504,6 +7504,46 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, return SDValue(); } +// Recurse to find a LoadSDNode source and the accumulated ByteOffset. 
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { + if (ISD::isNON_EXTLoad(Elt.getNode())) { + Ld = cast(Elt); + ByteOffset = 0; + return true; + } + + switch (Elt.getOpcode()) { + case ISD::BITCAST: + case ISD::TRUNCATE: + case ISD::SCALAR_TO_VECTOR: + return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); + case ISD::SRL: + if (isa(Elt.getOperand(1))) { + uint64_t Idx = Elt.getConstantOperandVal(1); + if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { + ByteOffset += Idx / 8; + return true; + } + } + break; + case ISD::EXTRACT_VECTOR_ELT: + if (isa(Elt.getOperand(1))) { + SDValue Src = Elt.getOperand(0); + unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); + unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); + if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && + findEltLoadSrc(Src, Ld, ByteOffset)) { + uint64_t Idx = Elt.getConstantOperandVal(1); + ByteOffset += Idx * (SrcSizeInBits / 8); + return true; + } + } + break; + } + + return false; +} + /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. @@ -7521,6 +7561,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, APInt UndefMask = APInt::getNullValue(NumElems); SmallVector Loads(NumElems, nullptr); + SmallVector ByteOffsets(NumElems, 0); // For each element in the initializer, see if we've found a load, zero or an // undef. @@ -7539,13 +7580,17 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, // Each loaded element must be the correct fractional portion of the // requested vector load. 
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits()) + unsigned EltSizeInBits = Elt.getValueSizeInBits(); + if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) return SDValue(); - if (!ISD::isNON_EXTLoad(Elt.getNode())) + if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i])) return SDValue(); + assert(0 <= ByteOffsets[i] && + ((ByteOffsets[i] * 8) + EltSizeInBits) <= + Loads[i]->getValueSizeInBits(0) && + "Element offset outside of load bounds"); - Loads[i] = cast(Elt); LoadMask.setBit(i); LastLoadedElt = i; } @@ -7575,6 +7620,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits; assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); + // Check to see if the element's load is consecutive to the base load + // or offset from a previous (already checked) load. + auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { + LoadSDNode *Ld = Loads[EltIdx]; + int64_t ByteOffset = ByteOffsets[EltIdx]; + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); + return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); + } + return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, + EltIdx - FirstLoadedElt); + }; + // Consecutive loads can contain UNDEFS but not ZERO elements. // Consecutive loads with UNDEFs and ZEROs elements require a // an additional shuffle stage to clear the ZERO elements. 
@@ -7582,8 +7641,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef Elts, bool IsConsecutiveLoadWithZeros = true; for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { if (LoadMask[i]) { - if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes, - i - FirstLoadedElt)) { + if (!CheckConsecutiveLoad(LDBase, i)) { IsConsecutiveLoad = false; IsConsecutiveLoadWithZeros = false; break; diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll index 983c7342603..78487bd162d 100644 --- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -985,99 +985,54 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX1-LABEL: _clearupper32xi8b: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX1-NEXT: movq %r9, %r8 -; AVX1-NEXT: shrq $56, %r8 -; AVX1-NEXT: andl $15, %r8d -; AVX1-NEXT: movq %rcx, %rsi -; AVX1-NEXT: movq %rcx, %rdi -; AVX1-NEXT: movq %rcx, %rdx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX1-NEXT: orq %rax, %rcx -; AVX1-NEXT: movq %r9, %rax -; AVX1-NEXT: shrq $48, %rax -; AVX1-NEXT: andl $15, %eax -; AVX1-NEXT: shrq $40, %rdx -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: shlq $40, %rdx -; AVX1-NEXT: orq %rcx, %rdx -; AVX1-NEXT: movq %r9, %rcx -; AVX1-NEXT: shrq $40, %rcx -; AVX1-NEXT: andl $15, %ecx -; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: andl $15, %edi -; AVX1-NEXT: shlq $48, %rdi -; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: movq %r9, %rdx -; AVX1-NEXT: shrq $32, %rdx -; AVX1-NEXT: andl $15, %edx -; AVX1-NEXT: shrq $56, %rsi -; AVX1-NEXT: andl $15, %esi -; AVX1-NEXT: shlq $56, %rsi -; AVX1-NEXT: orq %rdi, %rsi -; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: shlq 
$32, %rdx -; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F -; AVX1-NEXT: orq %rdx, %r9 -; AVX1-NEXT: shlq $40, %rcx -; AVX1-NEXT: orq %r9, %rcx -; AVX1-NEXT: shlq $48, %rax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: shlq $56, %r8 -; AVX1-NEXT: orq %rax, %r8 -; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: movq %rax, %r8 -; AVX1-NEXT: movq %rax, %r9 +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: movq %rax, %rdx ; AVX1-NEXT: movq %rax, %rsi ; AVX1-NEXT: movq %rax, %rdi -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: movl %eax, %edx -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: shrl $8, %eax -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 ; AVX1-NEXT: shrq $32, %rdi -; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 +; AVX1-NEXT: andl $15, %edi +; AVX1-NEXT: shlq $32, %rdi +; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; AVX1-NEXT: orq %rdi, %rax +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi ; AVX1-NEXT: shrq $40, %rsi -; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX1-NEXT: shrq $48, %r9 -; AVX1-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: shrq $56, %r8 -; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $8, %ecx -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movl %eax, %ecx -; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: shlq $40, %rsi +; AVX1-NEXT: orq %rax, %rsi +; AVX1-NEXT: movq %rdi, %rax +; AVX1-NEXT: shrq 
$48, %rdx +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: shlq $48, %rdx +; AVX1-NEXT: orq %rsi, %rdx +; AVX1-NEXT: movq %rdi, %rsi +; AVX1-NEXT: shrq $56, %rcx +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: shlq $56, %rcx +; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: movq %rdi, %rdx +; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: movq %rdi, %rcx ; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $40, %rcx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm2, %rcx +; AVX1-NEXT: andl $15, %ecx +; AVX1-NEXT: shlq $32, %rcx +; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; AVX1-NEXT: orq %rcx, %rdi +; AVX1-NEXT: shrq $40, %rdx +; AVX1-NEXT: andl $15, %edx +; AVX1-NEXT: shlq $40, %rdx +; AVX1-NEXT: orq %rdi, %rdx +; AVX1-NEXT: shrq $48, %rsi +; AVX1-NEXT: andl $15, %esi +; AVX1-NEXT: shlq $48, %rsi +; AVX1-NEXT: orq %rdx, %rsi ; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: andl $15, %eax +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: shlq $56, %rax +; AVX1-NEXT: orq %rsi, %rax +; AVX1-NEXT: vmovq %xmm0, %rcx +; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movl %ecx, %eax ; AVX1-NEXT: shrl $8, %eax ; AVX1-NEXT: vmovd %ecx, %xmm1 @@ -1097,129 +1052,85 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX1-NEXT: movq %rcx, %rax ; AVX1-NEXT: shrq $48, %rax ; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vpextrq $1, %xmm0, %rax ; AVX1-NEXT: shrq $56, %rcx -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0 ; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $8, %ecx -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $9, 
%ecx, %xmm0, %xmm0 ; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movl %eax, %ecx ; AVX1-NEXT: shrl $24, %ecx -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq $32, %rcx -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq $40, %rcx -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq $48, %rcx -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX1-NEXT: shrq $56, %rax -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: _clearupper32xi8b: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: movq %r9, %r8 -; AVX2-NEXT: shrq $56, %r8 -; AVX2-NEXT: andl $15, %r8d -; AVX2-NEXT: movq %rcx, %rsi -; AVX2-NEXT: movq %rcx, %rdi -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r9, %rax -; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: andl $15, %eax -; AVX2-NEXT: shrq $40, %rdx -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: shlq $40, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: movq %r9, %rcx -; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: andl $15, %ecx -; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: andl $15, %edi -; AVX2-NEXT: shlq $48, %rdi -; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: movq %r9, %rdx -; 
AVX2-NEXT: shrq $32, %rdx -; AVX2-NEXT: andl $15, %edx -; AVX2-NEXT: shrq $56, %rsi -; AVX2-NEXT: andl $15, %esi -; AVX2-NEXT: shlq $56, %rsi -; AVX2-NEXT: orq %rdi, %rsi -; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: shlq $32, %rdx -; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F -; AVX2-NEXT: orq %rdx, %r9 -; AVX2-NEXT: shlq $40, %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: shlq $48, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shlq $56, %r8 -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq %rax, %r9 +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: movq %rax, %rdx ; AVX2-NEXT: movq %rax, %rsi ; AVX2-NEXT: movq %rax, %rdi -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: movl %eax, %edx -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: shrl $8, %eax -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: shrl $16, %edx -; AVX2-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 ; AVX2-NEXT: shrq $32, %rdi -; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1 +; AVX2-NEXT: andl $15, %edi +; AVX2-NEXT: shlq $32, %rdi +; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; AVX2-NEXT: orq %rdi, %rax +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi ; AVX2-NEXT: shrq $40, %rsi -; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX2-NEXT: shrq $48, %r9 -; AVX2-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: shrq $56, %r8 -; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movl %eax, %ecx -; 
AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: shlq $40, %rsi +; AVX2-NEXT: orq %rax, %rsi +; AVX2-NEXT: movq %rdi, %rax +; AVX2-NEXT: shrq $48, %rdx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: shlq $48, %rdx +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: movq %rdi, %rsi +; AVX2-NEXT: shrq $56, %rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: shlq $56, %rcx +; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: movq %rdi, %rdx +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; AVX2-NEXT: movq %rdi, %rcx ; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm2, %rcx +; AVX2-NEXT: andl $15, %ecx +; AVX2-NEXT: shlq $32, %rcx +; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; AVX2-NEXT: orq %rcx, %rdi +; AVX2-NEXT: shrq $40, %rdx +; AVX2-NEXT: andl $15, %edx +; AVX2-NEXT: shlq $40, %rdx +; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: shrq $48, %rsi +; AVX2-NEXT: andl $15, %esi +; AVX2-NEXT: shlq $48, %rsi +; AVX2-NEXT: orq %rdx, %rsi ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: andl $15, %eax +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: shlq $56, %rax +; AVX2-NEXT: orq %rsi, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; AVX2-NEXT: movl %ecx, %eax ; AVX2-NEXT: shrl $8, %eax ; AVX2-NEXT: vmovd %ecx, %xmm1 @@ -1239,30 +1150,31 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { ; AVX2-NEXT: movq %rcx, %rax ; AVX2-NEXT: shrq $48, %rax ; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rax ; AVX2-NEXT: shrq $56, %rcx -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $7, 
%ecx, %xmm1, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $8, %ecx -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movl %eax, %ecx ; AVX2-NEXT: shrl $24, %ecx -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $32, %rcx -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $40, %rcx -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq $48, %rcx -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: shrq $56, %rax -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq %x4 = bitcast <32 x i8> %0 to <64 x i4> diff --git a/test/CodeGen/X86/load-partial.ll b/test/CodeGen/X86/load-partial.ll index a0a94b96f7d..236b3555d26 100644 --- a/test/CodeGen/X86/load-partial.ll +++ b/test/CodeGen/X86/load-partial.ll @@ -54,32 +54,14 @@ define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferen } define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE2-LABEL: load_float4_float3_as_float2_float: -; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE2-NEXT: retq -; -; 
SSSE3-LABEL: load_float4_float3_as_float2_float: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_float4_float3_as_float2_float: -; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: retq +; SSE-LABEL: load_float4_float3_as_float2_float: +; SSE: # %bb.0: +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovups (%rdi), %xmm0 ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to <2 x float>* %3 = load <2 x float>, <2 x float>* %2, align 4 @@ -94,36 +76,14 @@ define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture re } define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE2-LABEL: load_float4_float3_trunc: -; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_float4_float3_trunc: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_float4_float3_trunc: -; SSE41: # %bb.0: -; 
SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] -; SSE41-NEXT: retq +; SSE-LABEL: load_float4_float3_trunc: +; SSE: # %bb.0: +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: load_float4_float3_trunc: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* %3 = load i64, i64* %2, align 16 -- 2.40.0