From 1622e8f7490c0853bdfdf5a3e01378f3f810aed7 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 22 Feb 2019 07:03:25 +0000
Subject: [PATCH] [LegalizeVectorOps] Improve the placement of ANDs in the
 ExpandLoad path for non-byte-sized loads.

When we need to merge two adjacent loads, the AND mask for the low piece was
still sized for the full src element size. But we didn't have that many bits:
the upper bits are already zero due to the SRL. So we can skip the AND on the
low piece if we're going to combine it with the high bits. We do need an AND
to clear out any bits from the high part. We were ANDing the high part before
combining with the low part, but it looks like ANDing after the OR gets better
results. So we can just emit the final AND after the optional concatenation is
done. That handles skipping the AND before the OR and gets rid of the extra
high bits after the OR.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354655 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../SelectionDAG/LegalizeVectorOps.cpp |  13 +-
 test/CodeGen/X86/vector-zext-widen.ll  | 152 ++++++++++--------
 test/CodeGen/X86/vector-zext.ll        | 152 ++++++++++--------
 3 files changed, 181 insertions(+), 136 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f022d2405ae..3b57955cb5e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -654,21 +654,21 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
       LoadChains.push_back(ScalarLoad.getValue(1));
     }
 
-    // Extract bits, pack and extend/trunc them into destination type.
-    unsigned SrcEltBits = SrcEltVT.getSizeInBits();
-    SDValue SrcEltBitMask = DAG.getConstant((1U << SrcEltBits) - 1, dl, WideVT);
-
     unsigned BitOffset = 0;
     unsigned WideIdx = 0;
     unsigned WideBits = WideVT.getSizeInBits();
 
+    // Extract bits, pack and extend/trunc them into destination type.
+    unsigned SrcEltBits = SrcEltVT.getSizeInBits();
+    SDValue SrcEltBitMask = DAG.getConstant(
+        APInt::getLowBitsSet(WideBits, SrcEltBits), dl, WideVT);
+
     for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
       assert(BitOffset < WideBits && "Unexpected offset!");
 
       SDValue ShAmt = DAG.getConstant(
           BitOffset, dl, TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
       SDValue Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt);
-      Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
 
       BitOffset += SrcEltBits;
       if (BitOffset >= WideBits) {
@@ -680,11 +680,12 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
               TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
           SDValue Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx],
                                    ShAmt);
-          Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
           Lo = DAG.getNode(ISD::OR, dl, WideVT, Lo, Hi);
         }
       }
 
+      Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
+
       switch (ExtType) {
       default: llvm_unreachable("Unknown extended-load op!");
       case ISD::EXTLOAD:
diff --git a/test/CodeGen/X86/vector-zext-widen.ll b/test/CodeGen/X86/vector-zext-widen.ll
index 5e9c2a6b111..4d7a4f36659 100644
--- a/test/CodeGen/X86/vector-zext-widen.ll
+++ b/test/CodeGen/X86/vector-zext-widen.ll
@@ -2285,58 +2285,49 @@ define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) {
 define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) {
 ; SSE2-LABEL: zext_4i17_to_4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movl 8(%rdi), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: shll $13, %eax
-; SSE2-NEXT: movq (%rdi), %rcx
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: orl %eax, %edx
-; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: shrq $34, %rax
-; SSE2-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movl %ecx, %eax
-; SSE2-NEXT: andl $131071, %eax # imm = 0x1FFFF
+; SSE2-NEXT: movq (%rdi), %rax
 ; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movq %rax, %rcx
 ; SSE2-NEXT: shrq $17, %rcx
-; SSE2-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movl 8(%rdi), %ecx
+; SSE2-NEXT: shll $13, %ecx
+; SSE2-NEXT: movq %rax, %rdx
+; SSE2-NEXT: shrq $51, %rdx
+; SSE2-NEXT: orl %ecx, %edx
+; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: shrq $34, %rax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: zext_4i17_to_4i32:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movl 8(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: shll $13, %eax
-; SSSE3-NEXT: movq (%rdi), %rcx
-; SSSE3-NEXT: movq %rcx, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: orl %eax, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: shrq $34, %rax
-; SSSE3-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movl %ecx, %eax
-; SSSE3-NEXT: andl $131071, %eax # imm = 0x1FFFF
+; SSSE3-NEXT: movq (%rdi), %rax
 ; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movq %rax, %rcx
 ; SSSE3-NEXT: shrq $17, %rcx
-; SSSE3-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movl 8(%rdi), %ecx
+; SSSE3-NEXT: shll $13, %ecx
+; SSSE3-NEXT: movq %rax, %rdx
+; SSSE3-NEXT: shrq $51, %rdx
+; SSSE3-NEXT: orl %ecx, %edx
+; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: shrq $34, %rax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: zext_4i17_to_4i32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movl 8(%rdi), %eax
-; SSE41-NEXT: andl $15, %eax
 ; SSE41-NEXT: shll $13, %eax
 ; SSE41-NEXT: movq (%rdi), %rcx
 ; SSE41-NEXT: movq %rcx, %rdx
@@ -2344,38 +2335,69 @@ define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) {
 ; SSE41-NEXT: orl %eax, %edx
 ; SSE41-NEXT: movq %rcx, %rax
 ; SSE41-NEXT: shrq $17, %rax
-; SSE41-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSE41-NEXT: movl %ecx, %esi
-; SSE41-NEXT: andl $131071, %esi # imm = 0x1FFFF
-; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: movd %ecx, %xmm0
 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0
 ; SSE41-NEXT: shrq $34, %rcx
-; SSE41-NEXT: andl $131071, %ecx # imm = 0x1FFFF
 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
 ; SSE41-NEXT: pinsrd $3, %edx, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: zext_4i17_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movl 8(%rdi), %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: shll $13, %eax
-; AVX-NEXT: movq (%rdi), %rcx
-; AVX-NEXT: movq %rcx, %rdx
-; AVX-NEXT: shrq $51, %rdx
-; AVX-NEXT: orl %eax, %edx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $17, %rax
-; AVX-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: andl $131071, %esi # imm = 0x1FFFF
-; AVX-NEXT: vmovd %esi, %xmm0
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrq $34, %rcx
-; AVX-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: zext_4i17_to_4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl 8(%rdi), %eax
+; AVX1-NEXT: shll $13, %eax
+; AVX1-NEXT: movq (%rdi), %rcx
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $51, %rdx
+; AVX1-NEXT: orl %eax, %edx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $17, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrq $34, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_4i17_to_4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl 8(%rdi), %eax
+; AVX2-NEXT: shll $13, %eax
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $51, %rdx
+; AVX2-NEXT: orl %eax, %edx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $17, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrq $34, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_4i17_to_4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl 8(%rdi), %eax
+; AVX512-NEXT: shll $13, %eax
+; AVX512-NEXT: movq (%rdi), %rcx
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $51, %rdx
+; AVX512-NEXT: orl %eax, %edx
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: shrq $17, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: shrq $34, %rcx
+; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
   %a = load <4 x i17>, <4 x i17>* %ptr
   %b = zext <4 x i17> %a to <4 x i32>
   ret <4 x i32> %b
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 80f9022eead..d1983483412 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -2299,58 +2299,49 @@ define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) {
 define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) {
 ; SSE2-LABEL: zext_4i17_to_4i32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movl 8(%rdi), %eax
-; SSE2-NEXT: andl $15, %eax
-; SSE2-NEXT: shll $13, %eax
-; SSE2-NEXT: movq (%rdi), %rcx
-; SSE2-NEXT: movq %rcx, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: orl %eax, %edx
-; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: movq %rcx, %rax
-; SSE2-NEXT: shrq $34, %rax
-; SSE2-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movl %ecx, %eax
-; SSE2-NEXT: andl $131071, %eax # imm = 0x1FFFF
+; SSE2-NEXT: movq (%rdi), %rax
 ; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movq %rax, %rcx
 ; SSE2-NEXT: shrq $17, %rcx
-; SSE2-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movl 8(%rdi), %ecx
+; SSE2-NEXT: shll $13, %ecx
+; SSE2-NEXT: movq %rax, %rdx
+; SSE2-NEXT: shrq $51, %rdx
+; SSE2-NEXT: orl %ecx, %edx
+; SSE2-NEXT: movd %edx, %xmm1
+; SSE2-NEXT: shrq $34, %rax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: zext_4i17_to_4i32:
 ; SSSE3: # %bb.0:
-; SSSE3-NEXT: movl 8(%rdi), %eax
-; SSSE3-NEXT: andl $15, %eax
-; SSSE3-NEXT: shll $13, %eax
-; SSSE3-NEXT: movq (%rdi), %rcx
-; SSSE3-NEXT: movq %rcx, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: orl %eax, %edx
-; SSSE3-NEXT: movd %edx, %xmm0
-; SSSE3-NEXT: movq %rcx, %rax
-; SSSE3-NEXT: shrq $34, %rax
-; SSSE3-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movl %ecx, %eax
-; SSSE3-NEXT: andl $131071, %eax # imm = 0x1FFFF
+; SSSE3-NEXT: movq (%rdi), %rax
 ; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movq %rax, %rcx
 ; SSSE3-NEXT: shrq $17, %rcx
-; SSSE3-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movl 8(%rdi), %ecx
+; SSSE3-NEXT: shll $13, %ecx
+; SSSE3-NEXT: movq %rax, %rdx
+; SSSE3-NEXT: shrq $51, %rdx
+; SSSE3-NEXT: orl %ecx, %edx
+; SSSE3-NEXT: movd %edx, %xmm1
+; SSSE3-NEXT: shrq $34, %rax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: zext_4i17_to_4i32:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movl 8(%rdi), %eax
-; SSE41-NEXT: andl $15, %eax
 ; SSE41-NEXT: shll $13, %eax
 ; SSE41-NEXT: movq (%rdi), %rcx
 ; SSE41-NEXT: movq %rcx, %rdx
@@ -2358,38 +2349,69 @@ define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) {
 ; SSE41-NEXT: orl %eax, %edx
 ; SSE41-NEXT: movq %rcx, %rax
 ; SSE41-NEXT: shrq $17, %rax
-; SSE41-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; SSE41-NEXT: movl %ecx, %esi
-; SSE41-NEXT: andl $131071, %esi # imm = 0x1FFFF
-; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: movd %ecx, %xmm0
 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0
 ; SSE41-NEXT: shrq $34, %rcx
-; SSE41-NEXT: andl $131071, %ecx # imm = 0x1FFFF
 ; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
 ; SSE41-NEXT: pinsrd $3, %edx, %xmm0
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: zext_4i17_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movl 8(%rdi), %eax
-; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: shll $13, %eax
-; AVX-NEXT: movq (%rdi), %rcx
-; AVX-NEXT: movq %rcx, %rdx
-; AVX-NEXT: shrq $51, %rdx
-; AVX-NEXT: orl %eax, %edx
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $17, %rax
-; AVX-NEXT: andl $131071, %eax # imm = 0x1FFFF
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: andl $131071, %esi # imm = 0x1FFFF
-; AVX-NEXT: vmovd %esi, %xmm0
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: shrq $34, %rcx
-; AVX-NEXT: andl $131071, %ecx # imm = 0x1FFFF
-; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: zext_4i17_to_4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl 8(%rdi), %eax
+; AVX1-NEXT: shll $13, %eax
+; AVX1-NEXT: movq (%rdi), %rcx
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: shrq $51, %rdx
+; AVX1-NEXT: orl %eax, %edx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $17, %rax
+; AVX1-NEXT: vmovd %ecx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: shrq $34, %rcx
+; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: zext_4i17_to_4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl 8(%rdi), %eax
+; AVX2-NEXT: shll $13, %eax
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: shrq $51, %rdx
+; AVX2-NEXT: orl %eax, %edx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $17, %rax
+; AVX2-NEXT: vmovd %ecx, %xmm0
+; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: shrq $34, %rcx
+; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: zext_4i17_to_4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl 8(%rdi), %eax
+; AVX512-NEXT: shll $13, %eax
+; AVX512-NEXT: movq (%rdi), %rcx
+; AVX512-NEXT: movq %rcx, %rdx
+; AVX512-NEXT: shrq $51, %rdx
+; AVX512-NEXT: orl %eax, %edx
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: shrq $17, %rax
+; AVX512-NEXT: vmovd %ecx, %xmm0
+; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; AVX512-NEXT: shrq $34, %rcx
+; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
   %a = load <4 x i17>, <4 x i17>* %ptr
   %b = zext <4 x i17> %a to <4 x i32>
   ret <4 x i32> %b
-- 
2.40.0
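
A minimal standalone sketch of the extraction scheme this patch settles on, assuming little-endian packing of SrcEltBits-wide elements into 64-bit words; the helpers packElts and extractElt below are invented for illustration and are not LLVM APIs. The SRL of the containing word needs no mask of its own, the optional SHL+OR pulls in any bits that straddle into the next word, and a single AND with the low SrcEltBits mask afterwards covers both cases:

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack SrcEltBits-wide elements contiguously (little-endian bit order) into
// 64-bit words, mirroring the packed memory layout the legalizer loads from.
// Assumes SrcEltBits < 64 and that each element value fits in SrcEltBits.
static std::vector<uint64_t> packElts(const std::vector<uint64_t> &Elts,
                                      unsigned SrcEltBits) {
  std::vector<uint64_t> Words((Elts.size() * SrcEltBits + 63) / 64, 0);
  for (unsigned i = 0; i != Elts.size(); ++i) {
    unsigned Bit = i * SrcEltBits;
    Words[Bit / 64] |= Elts[i] << (Bit % 64);
    if (Bit % 64 + SrcEltBits > 64) // Element straddles into the next word.
      Words[Bit / 64 + 1] |= Elts[i] >> (64 - Bit % 64);
  }
  return Words;
}

// Extract element Idx, placing the single AND after the optional OR, the way
// the patched ExpandLoad path now orders the operations.
static uint64_t extractElt(const std::vector<uint64_t> &Words,
                           unsigned SrcEltBits, unsigned Idx) {
  const unsigned WideBits = 64;
  unsigned BitOffset = Idx * SrcEltBits;
  unsigned WideIdx = BitOffset / WideBits;
  BitOffset %= WideBits;

  // SRL: everything above (WideBits - BitOffset) is already zero, so the low
  // piece needs no mask of its own before a possible combine.
  uint64_t Lo = Words[WideIdx] >> BitOffset;

  if (BitOffset + SrcEltBits > WideBits) {
    // SHL + OR: pull in the bits that spilled into the next word.
    uint64_t Hi = Words[WideIdx + 1] << (WideBits - BitOffset);
    Lo |= Hi;
  }

  // One final AND clears any bits above SrcEltBits, whether or not the OR
  // happened.
  return Lo & ((1ULL << SrcEltBits) - 1);
}

int main() {
  // Four i17 values, like the zext_4i17_to_4i32 tests above.
  std::vector<uint64_t> Elts = {0x1FFFF, 0x12345, 0x0ABCD, 0x1BEEF};
  std::vector<uint64_t> Words = packElts(Elts, 17);
  for (unsigned i = 0; i != Elts.size(); ++i)
    std::printf("elt %u: expected 0x%llX, extracted 0x%llX\n", i,
                (unsigned long long)Elts[i],
                (unsigned long long)extractElt(Words, 17, i));
  return 0;
}

For the 4 x i17 case exercised above this matches the ordering the updated checks reflect: the scalar pieces are assembled first and a single vector pand/vpand applies the 0x1FFFF mask at the end, rather than an andl on every extracted piece.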