From e71312fb28dc7e0cff6b0670e67d0141a73f26ed Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 30 Aug 2019 17:35:08 +0000 Subject: [PATCH] [X86] Pass v32i16/v64i8 in zmm registers on KNL target. gcc and icc pass these types in zmm registers. This patch implements a quick hack to override the register type before calling convention handling to one that is legal. Longer term we might want to do something similar to 256-bit integer registers on AVX1 where we just split all the operations. Fixes PR42957 Differential Revision: https://reviews.llvm.org/D66708 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370495 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/ReleaseNotes.rst | 4 + lib/Target/X86/X86ISelLowering.cpp | 15 + test/CodeGen/X86/all-ones-vector.ll | 56 +- test/CodeGen/X86/avg-mask.ll | 106 +-- test/CodeGen/X86/avg.ll | 7 +- test/CodeGen/X86/avx512-calling-conv.ll | 125 ++- test/CodeGen/X86/avx512-ext.ll | 156 ++-- test/CodeGen/X86/avx512-insert-extract.ll | 35 +- test/CodeGen/X86/avx512-logic.ll | 32 +- test/CodeGen/X86/avx512-mask-op.ll | 306 ++++---- test/CodeGen/X86/avx512-select.ll | 80 +- test/CodeGen/X86/avx512-trunc.ll | 2 + test/CodeGen/X86/avx512-vbroadcast.ll | 4 +- test/CodeGen/X86/avx512-vbroadcasti128.ll | 28 +- test/CodeGen/X86/avx512-vbroadcasti256.ll | 28 +- test/CodeGen/X86/bitcast-and-setcc-512.ll | 50 +- .../X86/bitcast-int-to-vector-bool-zext.ll | 23 +- test/CodeGen/X86/bitcast-setcc-512.ll | 18 +- .../X86/broadcast-elm-cross-splat-vec.ll | 296 ++++---- test/CodeGen/X86/build-vector-512.ll | 648 ++++++---------- test/CodeGen/X86/combine-sdiv.ll | 14 +- test/CodeGen/X86/fast-isel-nontemporal.ll | 48 +- test/CodeGen/X86/fast-isel-vecload.ll | 56 +- test/CodeGen/X86/kshift.ll | 144 ++-- test/CodeGen/X86/madd.ll | 16 +- test/CodeGen/X86/masked_store_trunc.ll | 11 +- test/CodeGen/X86/masked_store_trunc_ssat.ll | 7 +- test/CodeGen/X86/masked_store_trunc_usat.ll | 13 +- .../X86/merge-consecutive-loads-512.ll | 16 +- 
test/CodeGen/X86/midpoint-int-vec-512.ll | 716 ++++++++++-------- test/CodeGen/X86/movmsk-cmp.ll | 12 + test/CodeGen/X86/nontemporal-loads-2.ll | 34 +- test/CodeGen/X86/nontemporal-loads.ll | 120 +-- test/CodeGen/X86/pmul.ll | 47 +- test/CodeGen/X86/pmulh.ll | 14 +- test/CodeGen/X86/shuffle-vs-trunc-512.ll | 4 + test/CodeGen/X86/subvector-broadcast.ll | 360 ++------- test/CodeGen/X86/var-permute-512.ll | 508 +++++++------ test/CodeGen/X86/vec_shift6.ll | 24 +- test/CodeGen/X86/vector-bitreverse.ll | 42 +- test/CodeGen/X86/vector-compare-results.ll | 34 +- test/CodeGen/X86/vector-fshl-512.ll | 576 +++++++------- test/CodeGen/X86/vector-fshl-rot-512.ll | 452 ++++++----- test/CodeGen/X86/vector-fshr-512.ll | 594 ++++++++------- test/CodeGen/X86/vector-fshr-rot-512.ll | 472 ++++++------ test/CodeGen/X86/vector-idiv-sdiv-512.ll | 200 ++--- test/CodeGen/X86/vector-idiv-udiv-512.ll | 216 +++--- test/CodeGen/X86/vector-lzcnt-512.ll | 172 +++-- test/CodeGen/X86/vector-popcnt-512.ll | 66 +- test/CodeGen/X86/vector-reduce-and-bool.ll | 4 + test/CodeGen/X86/vector-reduce-mul.ll | 4 + test/CodeGen/X86/vector-reduce-or-bool.ll | 4 + test/CodeGen/X86/vector-reduce-xor-bool.ll | 4 + test/CodeGen/X86/vector-rotate-512.ll | 488 ++++++------ test/CodeGen/X86/vector-sext.ll | 16 +- test/CodeGen/X86/vector-shift-ashr-512.ll | 160 ++-- test/CodeGen/X86/vector-shift-lshr-512.ll | 118 +-- test/CodeGen/X86/vector-shift-shl-512.ll | 112 +-- test/CodeGen/X86/vector-shuffle-512-v32.ll | 119 +-- test/CodeGen/X86/vector-shuffle-512-v64.ll | 232 +++--- test/CodeGen/X86/vector-shuffle-avx512.ll | 52 +- test/CodeGen/X86/vector-shuffle-v1.ll | 50 +- test/CodeGen/X86/vector-trunc-packus.ll | 2 + test/CodeGen/X86/vector-trunc-ssat.ll | 2 + test/CodeGen/X86/vector-trunc-usat.ll | 2 + test/CodeGen/X86/vector-trunc.ll | 2 + test/CodeGen/X86/vector-tzcnt-512.ll | 208 ++--- test/CodeGen/X86/vector-zext.ll | 8 +- test/CodeGen/X86/viabs.ll | 8 +- 69 files changed, 4427 insertions(+), 4175 deletions(-) diff 
--git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index c9f588636df..10f76e527f0 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -93,6 +93,10 @@ Changes to the X86 Target now stored in the lower bits of an xmm register and the upper bits are undefined. Previously the elements were spread apart with undefined bits in between them. +* v32i16 and v64i8 vectors with AVX512F enabled, but AVX512BW disabled will now + be passed in ZMM registers for calls and returns. Previously they were passed + in two YMM registers. Old behavior can be enabled by passing + -x86-enable-old-knl-abi. Changes to the AMDGPU Target ----------------------------- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 3db7a4605c4..1130fcf9c36 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -78,6 +78,13 @@ static cl::opt ExperimentalPrefLoopAlignment( " of the loop header PC will be 0)."), cl::Hidden); +// Added in 10.0. +static cl::opt EnableOldKNLABI( + "x86-enable-old-knl-abi", cl::init(false), + cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of " + "one ZMM register on AVX512F, but not AVX512BW targets."), + cl::Hidden); + static cl::opt MulConstantOptimization( "mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " @@ -1960,6 +1967,10 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return MVT::v32i8; + // FIXME: Should we just make these types legal and custom split operations? 
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return MVT::v16i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -1968,6 +1979,10 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, EVT VT) const { if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI()) return 1; + // FIXME: Should we just make these types legal and custom split operations? + if ((VT == MVT::v32i16 || VT == MVT::v64i8) && + Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI) + return 1; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll index d64b3d7e29b..cead6acb14a 100644 --- a/test/CodeGen/X86/all-ones-vector.ll +++ b/test/CodeGen/X86/all-ones-vector.ll @@ -466,16 +466,10 @@ define <64 x i8> @allones_v64i8() nounwind { ; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX2-NEXT: retl ; -; X32-KNL-LABEL: allones_v64i8: -; X32-KNL: # %bb.0: -; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-KNL-NEXT: retl -; -; X32-SKX-LABEL: allones_v64i8: -; X32-SKX: # %bb.0: -; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; X32-SKX-NEXT: retl +; X32-AVX512-LABEL: allones_v64i8: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-SSE-LABEL: allones_v64i8: ; X64-SSE: # %bb.0: @@ -498,16 +492,10 @@ define <64 x i8> @allones_v64i8() nounwind { ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X64-AVX2-NEXT: retq ; -; X64-KNL-LABEL: allones_v64i8: -; X64-KNL: # %bb.0: -; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X64-KNL-NEXT: retq -; -; X64-SKX-LABEL: allones_v64i8: -; X64-SKX: # %bb.0: -; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; X64-SKX-NEXT: retq +; X64-AVX512-LABEL: 
allones_v64i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq ret <64 x i8> } @@ -533,16 +521,10 @@ define <32 x i16> @allones_v32i16() nounwind { ; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX2-NEXT: retl ; -; X32-KNL-LABEL: allones_v32i16: -; X32-KNL: # %bb.0: -; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X32-KNL-NEXT: retl -; -; X32-SKX-LABEL: allones_v32i16: -; X32-SKX: # %bb.0: -; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; X32-SKX-NEXT: retl +; X32-AVX512-LABEL: allones_v32i16: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-SSE-LABEL: allones_v32i16: ; X64-SSE: # %bb.0: @@ -565,16 +547,10 @@ define <32 x i16> @allones_v32i16() nounwind { ; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X64-AVX2-NEXT: retq ; -; X64-KNL-LABEL: allones_v32i16: -; X64-KNL: # %bb.0: -; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; X64-KNL-NEXT: retq -; -; X64-SKX-LABEL: allones_v32i16: -; X64-SKX: # %bb.0: -; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; X64-SKX-NEXT: retq +; X64-AVX512-LABEL: allones_v32i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq ret <32 x i16> } diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll index 3519faa8536..a7ce07ab0cd 100644 --- a/test/CodeGen/X86/avg-mask.ll +++ b/test/CodeGen/X86/avg-mask.ll @@ -123,30 +123,33 @@ define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwin define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind { ; AVX512F-LABEL: avg_v64i8_mask: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: movq %rdi, %rcx +; AVX512F-NEXT: movl %edi, %ecx ; AVX512F-NEXT: kmovw %edi, %k1 -; 
AVX512F-NEXT: movl %edi, %edx -; AVX512F-NEXT: shrl $16, %edx -; AVX512F-NEXT: shrq $32, %rax -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: shrq $32, %rdi +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpavgb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: kmovw %edx, %k4 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z} -; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} +; AVX512F-NEXT: kmovw %edi, %k4 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z} +; AVX512F-NEXT: vpmovdb %zmm5, %xmm5 +; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; AVX512F-NEXT: vpmovdb %zmm4, %xmm4 +; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_mask: @@ -170,29 +173,31 @@ define <64 x i8> 
@avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin ; AVX512F-LABEL: avg_v64i8_maskz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: movq %rdi, %rcx +; AVX512F-NEXT: movl %edi, %ecx ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: movl %edi, %edx -; AVX512F-NEXT: shrl $16, %edx -; AVX512F-NEXT: shrq $32, %rax -; AVX512F-NEXT: shrq $48, %rcx -; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: shrq $32, %rdi +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: shrl $16, %ecx +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: kmovw %edx, %k4 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z} +; AVX512F-NEXT: kmovw %edi, %k4 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z} +; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm3, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v64i8_maskz: @@ -322,17 +327,21 @@ define <16 x i16> 
@avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nou define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind { ; AVX512F-LABEL: avg_v32i16_mask: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512F-NEXT: vpavgw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1 +; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_mask: @@ -357,15 +366,18 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi -; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: kmovw %edi, %k2 +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, 
%ymm1, %ymm1 ; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 ; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BWVL-LABEL: avg_v32i16_maskz: diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index e2819151ce7..a5fd84c32ed 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -1537,8 +1537,11 @@ define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512F-LABEL: avg_v64i8_3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: avg_v64i8_3: diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 4145fe98c77..5bbe76c609a 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-NEW +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl -x86-enable-old-knl-abi | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-OLD ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | 
FileCheck %s --check-prefix=KNL_X32 @@ -397,3 +398,125 @@ define <1 x i1> @test13(<1 x i1>* %foo) { %bar = load <1 x i1>, <1 x i1>* %foo ret <1 x i1> %bar } + +define void @test14(<32 x i16>* %x) { +; KNL-NEW-LABEL: test14: +; KNL-NEW: ## %bb.0: +; KNL-NEW-NEXT: pushq %rbx +; KNL-NEW-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEW-NEXT: .cfi_offset %rbx, -16 +; KNL-NEW-NEXT: movq %rdi, %rbx +; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEW-NEXT: callq _test14_callee +; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEW-NEXT: popq %rbx +; KNL-NEW-NEXT: retq +; +; KNL-OLD-LABEL: test14: +; KNL-OLD: ## %bb.0: +; KNL-OLD-NEXT: pushq %rbx +; KNL-OLD-NEXT: .cfi_def_cfa_offset 16 +; KNL-OLD-NEXT: .cfi_offset %rbx, -16 +; KNL-OLD-NEXT: movq %rdi, %rbx +; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0 +; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1 +; KNL-OLD-NEXT: callq _test14_callee +; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx) +; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx) +; KNL-OLD-NEXT: popq %rbx +; KNL-OLD-NEXT: retq +; +; SKX-LABEL: test14: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbx +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbx, -16 +; SKX-NEXT: movq %rdi, %rbx +; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: callq _test14_callee +; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: popq %rbx +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test14: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: .cfi_offset %esi, -8 +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: calll _test14_callee +; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: popl %esi +; KNL_X32-NEXT: retl + %a = load <32 x i16>, <32 x i16>* %x + %b = call <32 x i16> @test14_callee(<32 x i16> %a) + store <32 x i16> %b, <32 x i16>* %x + ret void +} +declare <32 x i16> @test14_callee(<32 x i16>) + +define void 
@test15(<64 x i8>* %x) { +; KNL-NEW-LABEL: test15: +; KNL-NEW: ## %bb.0: +; KNL-NEW-NEXT: pushq %rbx +; KNL-NEW-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEW-NEXT: .cfi_offset %rbx, -16 +; KNL-NEW-NEXT: movq %rdi, %rbx +; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0 +; KNL-NEW-NEXT: callq _test15_callee +; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx) +; KNL-NEW-NEXT: popq %rbx +; KNL-NEW-NEXT: retq +; +; KNL-OLD-LABEL: test15: +; KNL-OLD: ## %bb.0: +; KNL-OLD-NEXT: pushq %rbx +; KNL-OLD-NEXT: .cfi_def_cfa_offset 16 +; KNL-OLD-NEXT: .cfi_offset %rbx, -16 +; KNL-OLD-NEXT: movq %rdi, %rbx +; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0 +; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1 +; KNL-OLD-NEXT: callq _test15_callee +; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx) +; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx) +; KNL-OLD-NEXT: popq %rbx +; KNL-OLD-NEXT: retq +; +; SKX-LABEL: test15: +; SKX: ## %bb.0: +; SKX-NEXT: pushq %rbx +; SKX-NEXT: .cfi_def_cfa_offset 16 +; SKX-NEXT: .cfi_offset %rbx, -16 +; SKX-NEXT: movq %rdi, %rbx +; SKX-NEXT: vmovaps (%rdi), %zmm0 +; SKX-NEXT: callq _test15_callee +; SKX-NEXT: vmovaps %zmm0, (%rbx) +; SKX-NEXT: popq %rbx +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq +; +; KNL_X32-LABEL: test15: +; KNL_X32: ## %bb.0: +; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: .cfi_def_cfa_offset 8 +; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: .cfi_def_cfa_offset 16 +; KNL_X32-NEXT: .cfi_offset %esi, -8 +; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; KNL_X32-NEXT: vmovaps (%esi), %zmm0 +; KNL_X32-NEXT: calll _test15_callee +; KNL_X32-NEXT: vmovaps %zmm0, (%esi) +; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: popl %esi +; KNL_X32-NEXT: retl + %a = load <64 x i8>, <64 x i8>* %x + %b = call <64 x i8> @test15_callee(<64 x i8> %a) + store <64 x i8> %b, <64 x i8>* %x + ret void +} +declare <64 x i8> @test15_callee(<64 x i8>) diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index ab4dd02ee2c..1e484da2fff 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ 
-205,8 +205,8 @@ define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_32x8mem_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero @@ -216,6 +216,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 ; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8mem_to_32x16: @@ -227,8 +228,8 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; ; 
AVX512DQNOBW-LABEL: zext_32x8mem_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero @@ -238,6 +239,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = zext <32 x i8> %a to <32 x i16> @@ -248,17 +250,18 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi define <32 x i16> @sext_32x8mem_to_32x16(<32 
x i8> *%i , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_32x8mem_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2 -; KNL-NEXT: vpmovsxbw (%rdi), %ymm3 +; KNL-NEXT: vpmovsxbw (%rdi), %ymm2 +; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 ; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8mem_to_32x16: @@ -270,17 +273,18 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; ; AVX512DQNOBW-LABEL: sext_32x8mem_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2 -; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3 +; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2 +; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = sext <32 x i8> %a to <32 x i16> @@ -291,10 +295,10 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-LABEL: zext_32x8_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: vextracti128 $1, 
%ymm0, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8_to_32x16: @@ -304,10 +308,10 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; ; AVX512DQNOBW-LABEL: zext_32x8_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQNOBW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; 
AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = zext <32 x i8> %a to <32 x i16> ret <32 x i16> %x @@ -316,11 +320,11 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; KNL-LABEL: zext_32x8_to_32x16_mask: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 @@ -328,6 +332,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; KNL-NEXT: vpsllw $15, %ymm2, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 ; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8_to_32x16_mask: @@ -339,11 +344,11 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; ; AVX512DQNOBW-LABEL: zext_32x8_to_32x16_mask: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; 
AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 @@ -351,6 +356,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -360,10 +366,10 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-LABEL: sext_32x8_to_32x16: ; KNL: # %bb.0: -; KNL-NEXT: vpmovsxbw %xmm0, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpmovsxbw %xmm0, %ymm1 -; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovsxbw %xmm1, %ymm1 +; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8_to_32x16: @@ -373,10 +379,10 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; ; AVX512DQNOBW-LABEL: sext_32x8_to_32x16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm1 -; 
AVX512DQNOBW-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = sext <32 x i8> %a to <32 x i16> ret <32 x i16> %x @@ -385,11 +391,11 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { ; KNL-LABEL: sext_32x8_to_32x16_mask: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 -; KNL-NEXT: vpmovsxbw %xmm3, %ymm3 +; KNL-NEXT: vpmovsxbw %xmm0, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm0 ; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 @@ -397,6 +403,7 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; KNL-NEXT: vpsllw $15, %ymm2, %ymm1 ; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 ; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8_to_32x16_mask: @@ 
-408,11 +415,11 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; ; AVX512DQNOBW-LABEL: sext_32x8_to_32x16_mask: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX512DQNOBW-NEXT: vpmovsxbw %xmm3, %ymm3 +; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm3 +; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 @@ -420,6 +427,7 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi ; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1 ; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQNOBW-NEXT: retq %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -2032,11 +2040,14 @@ define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) { define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { ; KNL-LABEL: 
zext_64xi1_to_64xi8: ; KNL: # %bb.0: -; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; KNL-NEXT: vpand %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_64xi1_to_64xi8: @@ -2047,11 +2058,14 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { ; ; AVX512DQNOBW-LABEL: zext_64xi1_to_64xi8: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2 +; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <64 x i8> %x, %y %1 = zext <64 x i1> %mask to <64 x i8> @@ -2061,10 +2075,13 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 { ; KNL-LABEL: 
zext_32xi1_to_32xi16: ; KNL: # %bb.0: -; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vpsrlw $15, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32xi1_to_32xi16: @@ -2076,10 +2093,13 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 { ; ; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi16: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm2, %ymm2 +; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512DQNOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm1, %ymm1 +; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQNOBW-NEXT: retq %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i16> @@ -2101,11 +2121,13 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 { define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; KNL-LABEL: zext_32xi1_to_32xi8: ; KNL: # %bb.0: -; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; KNL-NEXT: vpmovdb 
%zmm0, %xmm0 -; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 @@ -2119,11 +2141,13 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; ; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi8: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512DQNOBW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQNOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; 
AVX512DQNOBW-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQNOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQNOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index e197d278bd7..37f16489df4 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -618,11 +618,11 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) { define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) { ; KNL-LABEL: insert_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2 -; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpinsrw $1, %edi, %xmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] ; KNL-NEXT: retq ; ; SKX-LABEL: insert_v32i16: @@ -669,11 +669,13 @@ define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) { define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) { ; KNL-LABEL: insert_v64i8: ; KNL: ## %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2 ; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_v64i8: @@ -1015,10 +1017,12 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) { define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: test_extractelement_v64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0 +; KNL-NEXT: 
vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1049,10 +1053,12 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) { define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: extractelement_v64i1_alt: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 -; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0 -; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1352,7 +1358,7 @@ define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) { ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi -; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: andl $31, %edi ; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax @@ -1428,7 +1434,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) { ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def $edi killed $edi def $rdi -; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: andl $63, %edi ; KNL-NEXT: movb (%rsp,%rdi), %al @@ -1470,7 
+1476,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: addb %dil, %dil -; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; KNL-NEXT: vmovaps %ymm0, (%rsp) ; KNL-NEXT: movzbl %dil, %eax ; KNL-NEXT: andl $63, %eax @@ -1764,6 +1770,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) { ; KNL-NEXT: andq $-64, %rsp ; KNL-NEXT: subq $128, %rsp ; KNL-NEXT: ## kill: def $esi killed $esi def $rsi +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll index 65d9d67b2ca..9d614ce6c94 100644 --- a/test/CodeGen/X86/avx512-logic.ll +++ b/test/CodeGen/X86/avx512-logic.ll @@ -166,8 +166,7 @@ entry: define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: and_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: and_v64i8: @@ -181,8 +180,11 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: andn_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andn_v64i8: @@ -200,8 +202,7 @@ define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: or_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vorps %ymm3, 
%ymm1, %ymm1 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: or_v64i8: @@ -215,8 +216,7 @@ define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) { define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { ; KNL-LABEL: xor_v64i8: ; KNL: ## %bb.0: -; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: xor_v64i8: @@ -230,8 +230,7 @@ define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: and_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: and_v32i16: @@ -245,8 +244,11 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: andn_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: andn_v32i16: @@ -262,8 +264,7 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: or_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vorps %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: or_v32i16: @@ -277,8 +278,7 @@ define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: xor_v32i16: ; KNL: ## %bb.0: -; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1 +; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: 
xor_v32i16: diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 1c6cc6fe12c..f9d754aa8c6 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1079,12 +1079,13 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test16: @@ -1132,12 +1133,13 @@ define <64 x i8> @test16(i64 %x) { ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test16: @@ -1182,12 +1184,13 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; 
KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test17: @@ -1241,12 +1244,13 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test17: @@ -1379,15 +1383,17 @@ define <8 x i1> @test18(i8 %a, i16 %y) { define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: ## %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; KNL-NEXT: vpsllw $15, %ymm1, %ymm1 +; KNL-NEXT: vpsraw $15, %ymm1, %ymm1 +; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; KNL-NEXT: vpsllw $15, %ymm2, %ymm2 ; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 ; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0 -; KNL-NEXT: vpsllw $15, %ymm3, %ymm2 -; KNL-NEXT: vpsraw $15, %ymm2, %ymm2 -; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test21: @@ -1406,15 +1412,17 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { ; ; AVX512DQ-LABEL: test21: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vextracti64x4 $1, 
%zmm0, %ymm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpsllw $15, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsraw $15, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpsllw $15, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test21: @@ -1849,8 +1857,10 @@ define void @store_i8_i1(i8 %x, i1 *%y) { define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; KNL-LABEL: test_build_vec_v32i1: ; KNL: ## %bb.0: -; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v32i1: @@ -1865,8 +1875,10 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { ; ; AVX512DQ-LABEL: test_build_vec_v32i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1: @@ -1880,8 +1892,10 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize { ; KNL-LABEL: test_build_vec_v32i1_optsize: ; KNL: ## %bb.0: -; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; 
KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v32i1_optsize: @@ -1900,8 +1914,10 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize { ; ; AVX512DQ-LABEL: test_build_vec_v32i1_optsize: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v32i1_optsize: @@ -1917,8 +1933,10 @@ define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize { define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; KNL-LABEL: test_build_vec_v64i1: ; KNL: ## %bb.0: -; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: test_build_vec_v64i1: @@ -1933,8 +1951,10 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { ; ; AVX512DQ-LABEL: test_build_vec_v64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: test_build_vec_v64i1: @@ -2385,10 +2405,11 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; KNL: ## %bb.0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; 
KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: load_32i1: @@ -2407,10 +2428,11 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: load_32i1: @@ -2431,16 +2453,17 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; KNL-NEXT: kmovw 2(%rdi), %k2 ; KNL-NEXT: kmovw 4(%rdi), %k3 ; KNL-NEXT: kmovw 6(%rdi), %k4 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z} +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm2, %xmm2 ; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: load_64i1: @@ -2461,16 +2484,17 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k3 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; 
AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k3, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; X86-LABEL: load_64i1: @@ -2684,6 +2708,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { ; KNL-LABEL: store_32i1_1: ; KNL: ## %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2713,6 +2738,7 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { ; ; AVX512DQ-LABEL: store_32i1_1: ; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 @@ -3918,20 +3944,24 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7 ; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8 ; KNL-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm0 +; KNL-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm7 ; KNL-NEXT: vpcmpeqw %ymm8, %ymm1, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm1 +; KNL-NEXT: vpor %ymm1, %ymm7, %ymm1 ; KNL-NEXT: vpcmpeqw %ymm8, %ymm2, %ymm2 -; KNL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm2 -; KNL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; KNL-NEXT: 
vpcmpeqw %ymm8, %ymm4, %ymm2 -; KNL-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm3 -; KNL-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm4 -; KNL-NEXT: vpor %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm5 +; KNL-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm3 +; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2 ; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm2 -; KNL-NEXT: vpor %ymm2, %ymm3, %ymm2 +; KNL-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2 +; KNL-NEXT: vpor %ymm2, %ymm5, %ymm2 ; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -4000,20 +4030,24 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z) ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm6 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7 ; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8 ; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm7 ; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm1 +; AVX512DQ-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2 -; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm3 -; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpor %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2 +; AVX512DQ-NEXT: vpor %ymm2, 
%ymm5, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 @@ -4079,47 +4113,51 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) { ; KNL: ## %bb.0: ; KNL-NEXT: pushq %rax ; KNL-NEXT: .cfi_def_cfa_offset 16 +; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm9 +; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm10 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm11 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7 ; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9 -; KNL-NEXT: vextracti128 $1, %ymm9, %xmm0 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm10 -; KNL-NEXT: vextracti128 $1, %ymm10, %xmm1 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm11 -; KNL-NEXT: vextracti128 $1, %ymm11, %xmm2 -; KNL-NEXT: vpor %xmm2, %xmm0, %xmm13 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL-NEXT: vpor %xmm3, %xmm1, %xmm12 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm4, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm5, %ymm5 -; KNL-NEXT: vextracti128 $1, %ymm5, %xmm1 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm6, %ymm6 -; KNL-NEXT: vextracti128 $1, %ymm6, %xmm0 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm13 +; KNL-NEXT: vextracti128 $1, %ymm13, %xmm4 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm7 +; KNL-NEXT: vextracti128 $1, %ymm7, %xmm5 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6 +; KNL-NEXT: vpor %xmm6, %xmm4, %xmm12 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6 +; KNL-NEXT: vextracti128 $1, %ymm6, %xmm4 +; KNL-NEXT: vpor %xmm4, %xmm5, %xmm11 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm5 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10 +; KNL-NEXT: vextracti128 $1, %ymm10, %xmm4 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0 +; KNL-NEXT: vpor %xmm0, %xmm5, %xmm0 +; KNL-NEXT: vpand %xmm0, %xmm12, %xmm12 +; KNL-NEXT: vpcmpeqb %ymm8, %ymm9, 
%ymm5 +; KNL-NEXT: vextracti128 $1, %ymm5, %xmm0 ; KNL-NEXT: vpor %xmm0, %xmm4, %xmm0 -; KNL-NEXT: vpand %xmm0, %xmm13, %xmm0 -; KNL-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm4 -; KNL-NEXT: vextracti128 $1, %ymm4, %xmm7 -; KNL-NEXT: vpor %xmm7, %xmm1, %xmm1 -; KNL-NEXT: vpand %xmm1, %xmm12, %xmm1 -; KNL-NEXT: vpor %xmm2, %xmm10, %xmm2 -; KNL-NEXT: vpor %xmm11, %xmm9, %xmm7 -; KNL-NEXT: vpor %xmm4, %xmm5, %xmm4 -; KNL-NEXT: vpand %xmm4, %xmm2, %xmm2 -; KNL-NEXT: vpor %xmm6, %xmm3, %xmm3 -; KNL-NEXT: vpand %xmm3, %xmm7, %xmm3 -; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 +; KNL-NEXT: vpand %xmm0, %xmm11, %xmm0 +; KNL-NEXT: vpor %xmm6, %xmm7, %xmm4 +; KNL-NEXT: vpor %xmm1, %xmm13, %xmm1 +; KNL-NEXT: vpor %xmm5, %xmm10, %xmm5 +; KNL-NEXT: vpand %xmm5, %xmm4, %xmm4 +; KNL-NEXT: vpor %xmm3, %xmm2, %xmm2 +; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpmovsxbd %xmm12, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: vpmovsxbd %xmm2, %zmm0 -; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 +; KNL-NEXT: vpmovsxbd %xmm4, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 ; KNL-NEXT: kmovw %k0, %eax -; KNL-NEXT: vpmovsxbd %xmm1, %zmm0 +; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: shll $16, %edx @@ -4185,47 +4223,51 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: pushq %rax ; AVX512DQ-NEXT: .cfi_def_cfa_offset 16 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm9 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm10 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm11 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7 ; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; 
AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9 -; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm10 -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm11 -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm2 -; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm13 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm2 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm12 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm4, %ymm3 -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm5, %ymm5 -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm6, %ymm6 -; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm0 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm13 +; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm7 +; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm5 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm12 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6 +; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm4 +; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm11 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10 +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm4 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm0 +; AVX512DQ-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-NEXT: vpand %xmm0, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5 +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm0 ; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512DQ-NEXT: vpand %xmm0, %xmm13, %xmm0 -; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm4 -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm7 -; AVX512DQ-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpand %xmm1, %xmm12, %xmm1 -; AVX512DQ-NEXT: vpor %xmm2, 
%xmm10, %xmm2 -; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm7 -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpand %xmm3, %xmm7, %xmm3 -; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0 +; AVX512DQ-NEXT: vpand %xmm0, %xmm11, %xmm0 +; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm4 +; AVX512DQ-NEXT: vpor %xmm1, %xmm13, %xmm1 +; AVX512DQ-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX512DQ-NEXT: vpand %xmm5, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovsxbd %xmm12, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax -; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %edx ; AVX512DQ-NEXT: shll $16, %edx diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll index 1ed7b408baf..31484c16dd5 100644 --- a/test/CodeGen/X86/avx512-select.ll +++ b/test/CodeGen/X86/avx512-select.ll @@ -436,32 +436,30 @@ define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind { define <32 x i16> @pr42355_v32i16(i1 %c, <32 x i16> %x, <32 x i16> %y) { ; X86-AVX512F-LABEL: pr42355_v32i16: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: pushl %ebp -; X86-AVX512F-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX512F-NEXT: .cfi_offset %ebp, -8 -; X86-AVX512F-NEXT: movl %esp, %ebp -; X86-AVX512F-NEXT: .cfi_def_cfa_register 
%ebp -; X86-AVX512F-NEXT: andl $-32, %esp -; X86-AVX512F-NEXT: subl $32, %esp -; X86-AVX512F-NEXT: testb $1, 8(%ebp) -; X86-AVX512F-NEXT: jne .LBB14_2 -; X86-AVX512F-NEXT: # %bb.1: -; X86-AVX512F-NEXT: vmovaps 40(%ebp), %ymm1 -; X86-AVX512F-NEXT: vmovaps %ymm2, %ymm0 -; X86-AVX512F-NEXT: .LBB14_2: -; X86-AVX512F-NEXT: movl %ebp, %esp -; X86-AVX512F-NEXT: popl %ebp -; X86-AVX512F-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX512F-NEXT: jne .LBB14_1 +; X86-AVX512F-NEXT: # %bb.2: +; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; X86-AVX512F-NEXT: retl +; X86-AVX512F-NEXT: .LBB14_1: +; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: pr42355_v32i16: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: testb $1, %dil -; X64-AVX512F-NEXT: jne .LBB14_2 -; X64-AVX512F-NEXT: # %bb.1: -; X64-AVX512F-NEXT: vmovaps %ymm2, %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm3, %ymm1 -; X64-AVX512F-NEXT: .LBB14_2: +; X64-AVX512F-NEXT: jne .LBB14_1 +; X64-AVX512F-NEXT: # %bb.2: +; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0 +; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; X64-AVX512F-NEXT: retq +; X64-AVX512F-NEXT: .LBB14_1: +; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X86-AVX512BW-LABEL: pr42355_v32i16: @@ -488,32 +486,30 @@ define <32 x i16> @pr42355_v32i16(i1 %c, <32 x i16> %x, <32 x i16> %y) { define <64 x i8> @pr42355_v64i8(i1 %c, <64 x i8> %x, <64 x i8> %y) { ; X86-AVX512F-LABEL: pr42355_v64i8: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: pushl %ebp -; X86-AVX512F-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX512F-NEXT: .cfi_offset %ebp, -8 -; X86-AVX512F-NEXT: movl %esp, %ebp -; X86-AVX512F-NEXT: 
.cfi_def_cfa_register %ebp -; X86-AVX512F-NEXT: andl $-32, %esp -; X86-AVX512F-NEXT: subl $32, %esp -; X86-AVX512F-NEXT: testb $1, 8(%ebp) -; X86-AVX512F-NEXT: jne .LBB15_2 -; X86-AVX512F-NEXT: # %bb.1: -; X86-AVX512F-NEXT: vmovaps 40(%ebp), %ymm1 -; X86-AVX512F-NEXT: vmovaps %ymm2, %ymm0 -; X86-AVX512F-NEXT: .LBB15_2: -; X86-AVX512F-NEXT: movl %ebp, %esp -; X86-AVX512F-NEXT: popl %ebp -; X86-AVX512F-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp) +; X86-AVX512F-NEXT: jne .LBB15_1 +; X86-AVX512F-NEXT: # %bb.2: +; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; X86-AVX512F-NEXT: retl +; X86-AVX512F-NEXT: .LBB15_1: +; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: pr42355_v64i8: ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: testb $1, %dil -; X64-AVX512F-NEXT: jne .LBB15_2 -; X64-AVX512F-NEXT: # %bb.1: -; X64-AVX512F-NEXT: vmovaps %ymm2, %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm3, %ymm1 -; X64-AVX512F-NEXT: .LBB15_2: +; X64-AVX512F-NEXT: jne .LBB15_1 +; X64-AVX512F-NEXT: # %bb.2: +; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0 +; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; X64-AVX512F-NEXT: retq +; X64-AVX512F-NEXT: .LBB15_1: +; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; X64-AVX512F-NEXT: retq ; ; X86-AVX512BW-LABEL: pr42355_v64i8: diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll index 45384a4df4f..57cabca0076 100644 --- a/test/CodeGen/X86/avx512-trunc.ll +++ b/test/CodeGen/X86/avx512-trunc.ll @@ -444,6 +444,7 @@ define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { ; KNL-LABEL: trunc_wb_512: ; KNL: ## %bb.0: +; 
KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero @@ -462,6 +463,7 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { ; KNL-LABEL: trunc_wb_512_mem: ; KNL: ## %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; KNL-NEXT: vpmovdb %zmm1, 16(%rdi) ; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 3277f99dce6..5e46f26f0b7 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -350,7 +350,7 @@ define <64 x i8> @_invec32xi8(<32 x i8>%a) { ; AVX512F-LABEL: _invec32xi8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: _invec32xi8: @@ -365,7 +365,7 @@ define <32 x i16> @_invec16xi16(<16 x i16>%a) { ; AVX512F-LABEL: _invec16xi16: ; AVX512F: # %bb.0: ; 
AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: _invec16xi16: diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll index 3ca4f9a44c5..4067690d7c4 100644 --- a/test/CodeGen/X86/avx512-vbroadcasti128.ll +++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll @@ -134,9 +134,10 @@ define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind { define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16: @@ -147,9 +148,10 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { ; ; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> @@ -160,9 +162,10 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { define <64 x i8> 
@test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8: @@ -173,9 +176,10 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { ; ; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] -; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] +; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> diff --git a/test/CodeGen/X86/avx512-vbroadcasti256.ll b/test/CodeGen/X86/avx512-vbroadcasti256.ll index b7710f3237a..8b6082fafcd 100644 --- a/test/CodeGen/X86/avx512-vbroadcasti256.ll +++ b/test/CodeGen/X86/avx512-vbroadcasti256.ll @@ -54,9 +54,10 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind { define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 -; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512VL-NEXT: vmovdqa 
(%rdi), %ymm0 +; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16: @@ -67,9 +68,10 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { ; ; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1 -; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <16 x i16>, <16 x i16> *%p %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> @@ -80,9 +82,10 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8: ; X64-AVX512VL: ## %bb.0: -; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 -; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512VL-NEXT: retq ; ; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8: @@ -93,9 +96,10 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; ; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8: ; X64-AVX512DQVL: ## %bb.0: -; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1 -; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0 -; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1 +; X64-AVX512DQVL-NEXT: vmovdqa 
(%rdi), %ymm0 +; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1 +; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0 +; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; X64-AVX512DQVL-NEXT: retq %1 = load <32 x i8>, <32 x i8> *%p %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll index 2abcdee7adb..9384b24ab28 100644 --- a/test/CodeGen/X86/bitcast-and-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll @@ -300,16 +300,20 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) { ; ; AVX512F-LABEL: v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm2 -; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %ecx -; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: shll $16, %eax @@ -608,30 +612,34 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) { ; ; AVX512F-LABEL: v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, 
%ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512F-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm6 -; AVX512F-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm5, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm6, %ymm5 +; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX512F-NEXT: vpand %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0 +; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: shll $16, %ecx ; AVX512F-NEXT: orl %eax, %ecx -; AVX512F-NEXT: vpand %xmm5, %xmm1, %xmm0 +; AVX512F-NEXT: vpand %xmm4, %xmm5, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %edx -; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0 +; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: shll $16, %eax diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 00bbaf38366..07e5f7a029c 100644 --- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll 
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -776,12 +776,13 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: shrl $16, %edi ; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLBW-LABEL: ext_i32_32i16: @@ -889,26 +890,26 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) { ; AVX512F-LABEL: ext_i64_64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: movq %rdi, %rcx +; AVX512F-NEXT: movl %edi, %ecx ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: movl %edi, %edx -; AVX512F-NEXT: shrl $16, %edx -; AVX512F-NEXT: shrq $32, %rax -; AVX512F-NEXT: shrq $48, %rcx +; AVX512F-NEXT: shrq $32, %rdi +; AVX512F-NEXT: shrq $48, %rax +; AVX512F-NEXT: shrl $16, %ecx ; AVX512F-NEXT: kmovw %ecx, %k2 ; AVX512F-NEXT: kmovw %eax, %k3 -; AVX512F-NEXT: kmovw %edx, %k4 +; AVX512F-NEXT: kmovw %edi, %k4 ; AVX512F-NEXT: movl {{.*}}(%rip), %eax -; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k4} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k4} {z} +; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z} +; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z} ; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VLBW-LABEL: ext_i64_64i8: diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll index 9f5097bab3c..09cfb8d10db 100644 --- a/test/CodeGen/X86/bitcast-setcc-512.ll +++ b/test/CodeGen/X86/bitcast-setcc-512.ll @@ -51,12 +51,14 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; AVX512F-LABEL: v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %ecx -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: shll $16, %eax @@ -256,10 +258,12 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) { ; ; AVX512F-LABEL: v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovmskb %ymm0, %ecx -; AVX512F-NEXT: vpmovmskb %ymm1, %eax +; AVX512F-NEXT: vpmovmskb %ymm2, %eax ; AVX512F-NEXT: shlq $32, %rax ; AVX512F-NEXT: orq %rcx, %rax ; AVX512F-NEXT: vzeroupper @@ -446,6 +450,7 @@ define void @bitcast_64i8_store(i64* %p, <64 x i8> %a0) { ; ; AVX512F-LABEL: bitcast_64i8_store: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3 @@ -515,6 +520,7 @@ define void @bitcast_32i16_store(i32* %p, 
<32 x i16> %a0) { ; ; AVX512F-LABEL: bitcast_32i16_store: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll index e947deea579..d4726f50a3a 100644 --- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -301,14 +301,14 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f64xi8_i16: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f64xi8_i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i16: ; AVX512BW: # %bb.0: @@ -333,14 +333,14 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f64xi8_i16: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f64xi8_i16: +; 
AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256] +; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i16: ; AVX512BW-64: # %bb.0: @@ -370,14 +370,14 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f64i8_i32: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f64i8_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64i8_i32: ; AVX512BW: # %bb.0: @@ -401,14 +401,14 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f64i8_i32: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f64i8_i32: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] +; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64i8_i32: ; AVX512BW-64: # %bb.0: @@ -438,14 +438,14 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f64xi8_i64: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f64xi8_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i64: ; AVX512BW: # %bb.0: @@ -469,14 +469,14 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f64xi8_i64: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f64xi8_i64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = 
[506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i64: ; AVX512BW-64: # %bb.0: @@ -507,15 +507,15 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f64xi8_i128: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f64xi8_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i128: ; AVX512BW: # %bb.0: @@ -541,15 +541,15 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f64xi8_i128: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1] -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f64xi8_i128: +; AVX2-64: 
# %bb.0: +; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i128: ; AVX512BW-64: # %bb.0: @@ -582,14 +582,14 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f64xi8_i256: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f64xi8_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f64xi8_i256: ; AVX512BW: # %bb.0: @@ -616,14 +616,14 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f64xi8_i256: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f64xi8_i256: +; AVX2-64: # %bb.0: +; 
AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31] +; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f64xi8_i256: ; AVX512BW-64: # %bb.0: @@ -848,14 +848,14 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f32xi16_i32: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f32xi16_i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i32: ; AVX512BW: # %bb.0: @@ -879,14 +879,14 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f32xi16_i32: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f32xi16_i32: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] +; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; 
AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i32: ; AVX512BW-64: # %bb.0: @@ -916,14 +916,14 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f32xi16_i64: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f32xi16_i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i64: ; AVX512BW: # %bb.0: @@ -947,14 +947,14 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f32xi16_i64: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f32xi16_i64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] +; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; 
AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i64: ; AVX512BW-64: # %bb.0: @@ -985,15 +985,15 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f32xi16_i128: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1] -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f32xi16_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i128: ; AVX512BW: # %bb.0: @@ -1019,15 +1019,15 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f32xi16_i128: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1] -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f32xi16_i128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand 
%ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i128: ; AVX512BW-64: # %bb.0: @@ -1060,14 +1060,14 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; -; NO-AVX512BW-LABEL: f32xi16_i256: -; NO-AVX512BW: # %bb.0: -; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-NEXT: retl +; AVX2-LABEL: f32xi16_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retl ; ; AVX512BW-LABEL: f32xi16_i256: ; AVX512BW: # %bb.0: @@ -1094,14 +1094,14 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) { ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; -; NO-AVX512BW-64-LABEL: f32xi16_i256: -; NO-AVX512BW-64: # %bb.0: -; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0 -; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1 -; NO-AVX512BW-64-NEXT: retq +; AVX2-64-LABEL: f32xi16_i256: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq ; ; AVX512BW-64-LABEL: f32xi16_i256: ; AVX512BW-64: # %bb.0: diff --git a/test/CodeGen/X86/build-vector-512.ll b/test/CodeGen/X86/build-vector-512.ll index aba8b13db96..4bc731cbf02 100644 
--- a/test/CodeGen/X86/build-vector-512.ll +++ b/test/CodeGen/X86/build-vector-512.ll @@ -156,159 +156,83 @@ define <16 x i32> @test_buildvector_v16i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i } define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15, i16 %a16, i16 %a17, i16 %a18, i16 %a19, i16 %a20, i16 %a21, i16 %a22, i16 %a23, i16 %a24, i16 %a25, i16 %a26, i16 %a27, i16 %a28, i16 %a29, i16 %a30, i16 %a31) { -; AVX512F-32-LABEL: test_buildvector_v32i16: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 
-; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-32-NEXT: retl -; -; AVX512F-64-LABEL: test_buildvector_v32i16: -; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-64-NEXT: vmovd %edi, %xmm0 -; AVX512F-64-NEXT: vpinsrw $1, %esi, %xmm0, 
%xmm0 -; AVX512F-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-64-NEXT: retq -; -; AVX512BW-32-LABEL: test_buildvector_v32i16: -; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $7, 
{{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-32-NEXT: retl +; AVX-32-LABEL: test_buildvector_v32i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $3, 
{{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-32-NEXT: retl ; -; AVX512BW-64-LABEL: test_buildvector_v32i16: -; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: 
vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-64-NEXT: vmovd %edi, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-64-NEXT: retq +; AVX-64-LABEL: test_buildvector_v32i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; 
AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vmovd %edi, %xmm1 +; AVX-64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-64-NEXT: retq %ins0 = insertelement <32 x i16> undef, i16 %a0, i32 0 %ins1 = insertelement <32 x i16> %ins0, i16 %a1, i32 1 %ins2 = insertelement <32 x i16> %ins1, i16 %a2, i32 2 @@ -345,287 +269,147 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i } define <64 x i8> 
@test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31, i8 %a32, i8 %a33, i8 %a34, i8 %a35, i8 %a36, i8 %a37, i8 %a38, i8 %a39, i8 %a40, i8 %a41, i8 %a42, i8 %a43, i8 %a44, i8 %a45, i8 %a46, i8 %a47, i8 %a48, i8 %a49, i8 %a50, i8 %a51, i8 %a52, i8 %a53, i8 %a54, i8 %a55, i8 %a56, i8 %a57, i8 %a58, i8 %a59, i8 %a60, i8 %a61, i8 %a62, i8 %a63) { -; AVX512F-32-LABEL: test_buildvector_v64i8: -; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; 
AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512F-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $3, 
{{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-32-NEXT: retl -; -; AVX512F-64-LABEL: test_buildvector_v64i8: -; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vmovd {{.*#+}} 
xmm1 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512F-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-64-NEXT: vmovd %edi, %xmm0 -; AVX512F-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512F-64-NEXT: vmovd {{.*#+}} 
xmm2 = mem[0],zero,zero,zero -; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512F-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-64-NEXT: retq -; -; AVX512BW-32-LABEL: test_buildvector_v64i8: -; AVX512BW-32: # %bb.0: -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb 
$13, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $11, 
{{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 -; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2 -; AVX512BW-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-32-NEXT: retl +; AVX-32-LABEL: test_buildvector_v64i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; 
AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), 
%xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-32-NEXT: retl ; -; AVX512BW-64-LABEL: test_buildvector_v64i8: -; AVX512BW-64: # %bb.0: -; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $6, 
{{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-64-NEXT: vmovd %edi, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 
-; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 -; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 -; AVX512BW-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512BW-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-64-NEXT: retq +; AVX-64-LABEL: test_buildvector_v64i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0 
+; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVX-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: vmovd %edi, %xmm1 +; AVX-64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $3, %ecx, 
%xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1 +; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2 +; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX-64-NEXT: retq %ins0 = insertelement <64 x i8> undef, i8 %a0, i32 0 %ins1 = insertelement <64 x i8> %ins0, i8 %a1, i32 1 %ins2 = insertelement <64 x i8> %ins1, i8 %a2, i32 2 diff --git a/test/CodeGen/X86/combine-sdiv.ll 
b/test/CodeGen/X86/combine-sdiv.ll index 156cf375354..e3384896dc1 100644 --- a/test/CodeGen/X86/combine-sdiv.ll +++ b/test/CodeGen/X86/combine-sdiv.ll @@ -933,24 +933,26 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] ; AVX512F-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] ; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2 ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2 ; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll index 37e380b2b48..7345df7ae0f 100644 --- 
a/test/CodeGen/X86/fast-isel-nontemporal.ll +++ b/test/CodeGen/X86/fast-isel-nontemporal.ll @@ -891,6 +891,8 @@ define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) { ; ; AVX512VL-LABEL: test_nt64xi8: ; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi) ; AVX512VL-NEXT: vzeroupper @@ -898,6 +900,8 @@ define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) { ; ; AVX512F-LABEL: test_nt64xi8: ; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) ; AVX512F-NEXT: vzeroupper @@ -931,6 +935,8 @@ define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) { ; ; AVX512VL-LABEL: test_nt32xi16: ; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi) ; AVX512VL-NEXT: vzeroupper @@ -938,6 +944,8 @@ define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) { ; ; AVX512F-LABEL: test_nt32xi16: ; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: vmovntdq %ymm0, (%rdi) ; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi) ; AVX512F-NEXT: vzeroupper @@ -1162,22 +1170,10 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) { ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_load_nt64xi8: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq -; -; AVX512F-LABEL: test_load_nt64xi8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: 
vmovntdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_load_nt64xi8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_load_nt64xi8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 +; AVX512-NEXT: retq entry: %0 = load <64 x i8>, <64 x i8>* %ptr, align 64, !nontemporal !1 ret <64 x i8> %0 @@ -1228,22 +1224,10 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) { ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: test_load_nt32xi16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq -; -; AVX512F-LABEL: test_load_nt32xi16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_load_nt32xi16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_load_nt32xi16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 +; AVX512-NEXT: retq entry: %0 = load <32 x i16>, <32 x i16>* %ptr, align 64, !nontemporal !1 ret <32 x i16> %0 diff --git a/test/CodeGen/X86/fast-isel-vecload.ll b/test/CodeGen/X86/fast-isel-vecload.ll index 31730493fb5..1cdeae96b2d 100644 --- a/test/CodeGen/X86/fast-isel-vecload.ll +++ b/test/CodeGen/X86/fast-isel-vecload.ll @@ -684,16 +684,10 @@ define <64 x i8> @test_v64i8(<64 x i8>* %V) { ; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; -; KNL-LABEL: test_v64i8: -; KNL: # %bb.0: # %entry -; KNL-NEXT: vmovaps (%rdi), %ymm0 -; KNL-NEXT: vmovaps 32(%rdi), %ymm1 -; KNL-NEXT: retq -; -; SKX-LABEL: test_v64i8: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqa64 (%rdi), 
%zmm0 +; AVX512-NEXT: retq entry: %0 = load <64 x i8>, <64 x i8>* %V, align 64 ret <64 x i8> %0 @@ -714,16 +708,10 @@ define <32 x i16> @test_v32i16(<32 x i16>* %V) { ; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; -; KNL-LABEL: test_v32i16: -; KNL: # %bb.0: # %entry -; KNL-NEXT: vmovaps (%rdi), %ymm0 -; KNL-NEXT: vmovaps 32(%rdi), %ymm1 -; KNL-NEXT: retq -; -; SKX-LABEL: test_v32i16: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 -; SKX-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: retq entry: %0 = load <32 x i16>, <32 x i16>* %V, align 64 ret <32 x i16> %0 @@ -792,16 +780,10 @@ define <64 x i8> @test_v64i8_unaligned(<64 x i8>* %V) { ; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; -; KNL-LABEL: test_v64i8_unaligned: -; KNL: # %bb.0: # %entry -; KNL-NEXT: vmovups (%rdi), %ymm0 -; KNL-NEXT: vmovups 32(%rdi), %ymm1 -; KNL-NEXT: retq -; -; SKX-LABEL: test_v64i8_unaligned: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 -; SKX-NEXT: retq +; AVX512-LABEL: test_v64i8_unaligned: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: retq entry: %0 = load <64 x i8>, <64 x i8>* %V, align 4 ret <64 x i8> %0 @@ -822,16 +804,10 @@ define <32 x i16> @test_v32i16_unaligned(<32 x i16>* %V) { ; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1 ; AVXONLY-NEXT: retq ; -; KNL-LABEL: test_v32i16_unaligned: -; KNL: # %bb.0: # %entry -; KNL-NEXT: vmovups (%rdi), %ymm0 -; KNL-NEXT: vmovups 32(%rdi), %ymm1 -; KNL-NEXT: retq -; -; SKX-LABEL: test_v32i16_unaligned: -; SKX: # %bb.0: # %entry -; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 -; SKX-NEXT: retq +; AVX512-LABEL: test_v32i16_unaligned: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512-NEXT: retq entry: %0 = load <32 x i16>, <32 x i16>* %V, align 4 ret <32 x i16> %0 diff --git a/test/CodeGen/X86/kshift.ll b/test/CodeGen/X86/kshift.ll index 
2a91cd44a9e..2da2f184fd6 100644 --- a/test/CodeGen/X86/kshift.ll +++ b/test/CodeGen/X86/kshift.ll @@ -61,21 +61,23 @@ define i16 @kshiftl_v16i1_1(<16 x i32> %x, <16 x i32> %y) { define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftl_v32i1_1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vpmovsxwd %ymm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1 ; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: kshiftlw $1, %k2, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vpmovsxwd %ymm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %ecx @@ -105,6 +107,8 @@ define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftl_v64i1_1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 @@ -112,32 +116,32 @@ define i64 
@kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3 +; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} +; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z} ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} ; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14] ; KNL-NEXT: kshiftlw $1, %k1, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm6 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 -; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3 -; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4 ; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3} +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx @@ -145,7 +149,7 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-NEXT: orl %eax, %ecx ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2} ; KNL-NEXT: kmovw %k0, %edx -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax ; KNL-NEXT: orl %edx, %eax @@ -229,12 +233,13 @@ define i16 @kshiftl_v16i1_15(<16 x i32> %x, <16 x i32> %y) { define i32 @kshiftl_v32i1_31(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftl_v32i1_31: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm0 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax @@ -261,13 +266,14 @@ define i32 @kshiftl_v32i1_31(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftl_v64i1_63: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0 -; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0 +; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: 
kmovw %k0, %eax @@ -352,21 +358,23 @@ define i16 @kshiftr_v16i1_1(<16 x i32> %x, <16 x i32> %y) { define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftr_v32i1_1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vpmovsxwd %ymm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1 ; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0] +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] ; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 -; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1 +; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1 ; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %ecx @@ -396,42 +404,44 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftr_v64i1_1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm5 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3 +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm5 ; KNL-NEXT: vpmovsxbd %xmm5, %zmm5 ; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 ; KNL-NEXT: vpcmpeqb 
%ymm4, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm5 ; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2 -; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3 +; KNL-NEXT: vpmovsxbd %xmm3, %zmm3 +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} -; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0] +; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z} +; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0] ; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z} ; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0] ; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} -; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0] +; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0] ; KNL-NEXT: kshiftrw $1, %k1, %k3 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6 ; KNL-NEXT: vpmovsxbd %xmm6, %zmm6 ; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2 -; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm2 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm3 -; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4 -; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2 -; KNL-NEXT: vpmovsxbd %xmm2, %zmm2 -; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3} +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2 +; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm2 +; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4 +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vpmovsxbd %xmm1, %zmm1 +; KNL-NEXT: vptestmd %zmm1, %zmm1, 
%k0 {%k3} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: shll $16, %eax -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k4} +; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4} ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: orl %eax, %ecx ; KNL-NEXT: shlq $32, %rcx @@ -520,12 +530,13 @@ define i16 @kshiftr_v16i1_15(<16 x i32> %x, <16 x i32> %y) { define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) { ; KNL-LABEL: kshiftr_v32i1_31: ; KNL: # %bb.0: -; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm1 -; KNL-NEXT: vpmovsxwd %ymm1, %zmm1 -; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k1 -; KNL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax @@ -551,13 +562,14 @@ define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) { define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) { ; KNL-LABEL: kshiftr_v64i1_63: ; KNL: # %bb.0: -; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0 -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kshiftrw $15, %k0, %k1 -; KNL-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm0 +; KNL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0 ; KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index 100a014619a..36dbb46f0b0 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -1975,9 +1975,11 @@ define <16 x 
i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) { ; ; AVX512F-LABEL: pmaddwd_32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: pmaddwd_32: @@ -2186,9 +2188,11 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) { ; ; AVX512F-LABEL: jumbled_indices16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: jumbled_indices16: diff --git a/test/CodeGen/X86/masked_store_trunc.ll b/test/CodeGen/X86/masked_store_trunc.ll index a55f14dd919..5b2bf3b177c 100644 --- a/test/CodeGen/X86/masked_store_trunc.ll +++ b/test/CodeGen/X86/masked_store_trunc.ll @@ -5384,14 +5384,15 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, 
%xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovmskb %ymm2, %eax +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB15_1 diff --git a/test/CodeGen/X86/masked_store_trunc_ssat.ll b/test/CodeGen/X86/masked_store_trunc_ssat.ll index c0dd2893c79..777d4d14e4e 100644 --- a/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -6450,11 +6450,12 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vpmovmskb %ymm2, %eax +; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB15_1 diff --git a/test/CodeGen/X86/masked_store_trunc_usat.ll b/test/CodeGen/X86/masked_store_trunc_usat.ll index 610fcca38fe..254f0cda48f 100644 --- a/test/CodeGen/X86/masked_store_trunc_usat.ll +++ 
b/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -6195,17 +6195,18 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma ; ; AVX512F-LABEL: truncstore_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovmskb %ymm2, %eax +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovmskb %ymm1, %eax ; AVX512F-NEXT: notl %eax ; AVX512F-NEXT: testb $1, %al ; AVX512F-NEXT: jne .LBB15_1 diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll index eb7eff6955a..9405624265c 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ 
b/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -463,7 +463,7 @@ define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) n ; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz: @@ -475,7 +475,7 @@ define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) n ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2 @@ -518,7 +518,7 @@ define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) n ; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu: @@ -530,7 +530,7 @@ define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) n ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3 @@ -550,7 +550,7 @@ define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; 
AVX512F: # %bb.0: ; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: @@ -562,7 +562,7 @@ define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 @@ -588,7 +588,7 @@ define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz: @@ -600,7 +600,7 @@ define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2 diff --git a/test/CodeGen/X86/midpoint-int-vec-512.ll b/test/CodeGen/X86/midpoint-int-vec-512.ll index 0403f46b016..c3743ca82a1 100644 --- a/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -315,44 +315,50 @@ define <8 x i64> @vec512_i64_signed_mem_mem(<8 x i64>* %a1_addr, <8 x i64>* %a2_ define <32 x 
i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: 
vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_reg: @@ -382,48 +388,54 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind { ; AVX512F-LABEL: vec512_i16_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm4 -; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 
; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm1, %ymm8 +; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm7 +; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 ; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8 ; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpmaxuw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm0, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminuw %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm1, %ymm8 +; 
AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 ; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg: @@ -455,48 +467,52 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n define <32 x i16> @vec512_i16_signed_mem_reg(<32 x i16>* %a1_addr, <32 x i16> %a2) nounwind { ; AVX512F-LABEL: vec512_i16_signed_mem_reg: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, 
%ymm5 -; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6 -; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7 ; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7 ; 
AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_mem_reg: @@ -528,48 +544,52 @@ define <32 x i16> @vec512_i16_signed_mem_reg(<32 x i16>* %a1_addr, <32 x i16> %a define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i16_signed_reg_mem: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; 
AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; 
AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_reg_mem: @@ -605,23 +625,24 @@ define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* % ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpmullw 
%ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem: @@ -630,23 +651,24 @@ define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* % ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; 
AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i16_signed_mem_mem: @@ -686,84 +708,90 @@ define <32 x i16> @vec512_i16_signed_mem_mem(<32 x i16>* %a1_addr, <32 x i16>* % define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm7 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: 
vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; 
AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; 
AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_reg: @@ -803,88 +831,94 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm4 -; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm1, %ymm8 +; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm7 +; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 ; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8 ; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512F-NEXT: vpmaxub %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpmaxub %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm0, %ymm4 -; 
AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm1, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 ; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 ; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg: @@ -926,88 +960,92 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) nounwind { ; AVX512F-LABEL: vec512_i8_signed_mem_reg: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm4 +; 
AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm6 -; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm7 -; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm7 +; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: 
vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm4 +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; 
AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm7 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} 
ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, 
%ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_reg: @@ -1049,88 +1087,92 @@ define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) n define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind { ; AVX512F-LABEL: vec512_i8_signed_reg_mem: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; 
AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem: ; AVX512VL-FALLBACK: # %bb.0: +; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, 
%ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; 
AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_reg_mem: @@ -1176,43 +1218,44 @@ define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_a ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512F-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 ; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = 
ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem: @@ -1221,43 +1264,44 @@ define <64 x i8> @vec512_i8_signed_mem_mem(<64 x i8>* %a1_addr, <64 x i8>* %a2_a ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), 
%ymm1 ; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2 ; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6 ; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7 ; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6 ; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-FALLBACK-NEXT: retq ; ; AVX512BW-LABEL: vec512_i8_signed_mem_mem: diff --git a/test/CodeGen/X86/movmsk-cmp.ll b/test/CodeGen/X86/movmsk-cmp.ll index c144ca6183d..670beca3000 100644 --- a/test/CodeGen/X86/movmsk-cmp.ll +++ b/test/CodeGen/X86/movmsk-cmp.ll @@ -224,6 +224,7 @@ define i1 @allones_v64i8_sign(<64 x i8> %arg) { ; ; KNL-LABEL: allones_v64i8_sign: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax ; KNL-NEXT: shlq $32, %rax ; KNL-NEXT: vpmovmskb %ymm0, 
%ecx @@ -292,6 +293,7 @@ define i1 @allzeros_v64i8_sign(<64 x i8> %arg) { ; ; KNL-LABEL: allzeros_v64i8_sign: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax ; KNL-NEXT: shlq $32, %rax ; KNL-NEXT: vpmovmskb %ymm0, %ecx @@ -545,6 +547,7 @@ define i1 @allones_v32i16_sign(<32 x i16> %arg) { ; ; KNL-LABEL: allones_v32i16_sign: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -612,6 +615,7 @@ define i1 @allzeros_v32i16_sign(<32 x i16> %arg) { ; ; KNL-LABEL: allzeros_v32i16_sign: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -1459,6 +1463,7 @@ define i1 @allones_v64i8_and1(<64 x i8> %arg) { ; ; KNL-LABEL: allones_v64i8_and1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $7, %ymm1, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax @@ -1540,6 +1545,7 @@ define i1 @allzeros_v64i8_and1(<64 x i8> %arg) { ; ; KNL-LABEL: allzeros_v64i8_and1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $7, %ymm1, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax @@ -1762,6 +1768,7 @@ define i1 @allones_v32i16_and1(<32 x i16> %arg) { ; ; KNL-LABEL: allones_v32i16_and1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -1841,6 +1848,7 @@ define i1 @allzeros_v32i16_and1(<32 x i16> %arg) { ; ; KNL-LABEL: allzeros_v32i16_and1: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -2864,6 +2872,7 @@ define i1 @allones_v64i8_and4(<64 x i8> %arg) { ; 
; KNL-LABEL: allones_v64i8_and4: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $5, %ymm1, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax @@ -2945,6 +2954,7 @@ define i1 @allzeros_v64i8_and4(<64 x i8> %arg) { ; ; KNL-LABEL: allzeros_v64i8_and4: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $5, %ymm0, %ymm0 ; KNL-NEXT: vpsllw $5, %ymm1, %ymm1 ; KNL-NEXT: vpmovmskb %ymm1, %eax @@ -3167,6 +3177,7 @@ define i1 @allones_v32i16_and4(<32 x i16> %arg) { ; ; KNL-LABEL: allones_v32i16_and4: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 @@ -3246,6 +3257,7 @@ define i1 @allzeros_v32i16_and4(<32 x i16> %arg) { ; ; KNL-LABEL: allzeros_v32i16_and4: ; KNL: # %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsllw $13, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpmovsxwd %ymm0, %zmm0 diff --git a/test/CodeGen/X86/nontemporal-loads-2.ll b/test/CodeGen/X86/nontemporal-loads-2.ll index 0441d3a4922..a141db3c061 100644 --- a/test/CodeGen/X86/nontemporal-loads-2.ll +++ b/test/CodeGen/X86/nontemporal-loads-2.ll @@ -511,16 +511,10 @@ define <32 x i16> @test_v32i16_align1(<32 x i16>* %src) nounwind { ; AVX2-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v32i16_align1: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16_align1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovups (%rdi), %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v32i16_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1 ret <32 x i16> %1 } @@ -546,16 +540,10 @@ define <64 x i8> @test_v64i8_align1(<64 x i8>* %src) nounwind { ; AVX2-NEXT: vmovups 
32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512DQ-LABEL: test_v64i8_align1: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8_align1: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovups (%rdi), %zmm0 -; AVX512BW-NEXT: retq +; AVX512-LABEL: test_v64i8_align1: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1 ret <64 x i8> %1 } @@ -948,7 +936,7 @@ define <32 x i16> @test_v32i16_align16(<32 x i16>* %src) nounwind { ; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0 -; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0 ; AVX512DQ-NEXT: movq %rbp, %rsp ; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq @@ -1047,7 +1035,7 @@ define <64 x i8> @test_v64i8_align16(<64 x i8>* %src) nounwind { ; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp) ; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0 -; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0 ; AVX512DQ-NEXT: movq %rbp, %rsp ; AVX512DQ-NEXT: popq %rbp ; AVX512DQ-NEXT: retq @@ -1315,6 +1303,7 @@ define <32 x i16> @test_v32i16_align32(<32 x i16>* %src) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_v32i16_align32: @@ -1372,6 +1361,7 @@ define <64 x i8> @test_v64i8_align32(<64 x i8>* %src) nounwind { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: test_v64i8_align32: diff --git 
a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll index 8af4a680c77..5dfdb917a5d 100644 --- a/test/CodeGen/X86/nontemporal-loads.ll +++ b/test/CodeGen/X86/nontemporal-loads.ll @@ -550,22 +550,10 @@ define <32 x i16> @test_v32i16(<32 x i16>* %src) { ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_v32i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1 ret <32 x i16> %1 } @@ -603,22 +591,10 @@ define <64 x i8> @test_v64i8(<64 x i8>* %src) { ; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_v64i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovntdqa (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1 ret <64 x i8> %1 } @@ -1299,10 +1275,12 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) { ; ; AVX512F-LABEL: test_arg_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3 -; AVX512F-NEXT: 
vpaddw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_arg_v32i16: @@ -1313,10 +1291,12 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) { ; ; AVX512VL-LABEL: test_arg_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2 -; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm3 -; AVX512VL-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3 +; AVX512VL-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1 %2 = add <32 x i16> %arg, %1 @@ -1370,10 +1350,12 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) { ; ; AVX512F-LABEL: test_arg_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2 +; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_arg_v64i8: @@ -1384,10 +1366,12 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) { ; ; AVX512VL-LABEL: test_arg_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2 -; AVX512VL-NEXT: 
vmovntdqa (%rdi), %ymm3 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2 +; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1 %2 = add <64 x i8> %arg, %1 @@ -1742,22 +1726,10 @@ define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) { ; AVX-NEXT: vmovups 32(%rdi), %ymm1 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_unaligned_v32i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovups (%rdi), %ymm0 -; AVX512F-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_unaligned_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovups (%rdi), %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_unaligned_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovups (%rdi), %ymm0 -; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_unaligned_v32i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovups (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1 ret <32 x i16> %1 } @@ -1777,22 +1749,10 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) { ; AVX-NEXT: vmovups 32(%rdi), %ymm1 ; AVX-NEXT: retq ; -; AVX512F-LABEL: test_unaligned_v64i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovups (%rdi), %ymm0 -; AVX512F-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_unaligned_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovups (%rdi), %zmm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_unaligned_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovups (%rdi), %ymm0 -; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_unaligned_v64i8: +; AVX512: # %bb.0: +; AVX512-NEXT: 
vmovups (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1 ret <64 x i8> %1 } diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index fba90d2d41d..3b856314390 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -818,22 +818,24 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; ; AVX512F-LABEL: mul_v64i8c: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8c: @@ -973,25 +975,28 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; ; AVX512F-LABEL: mul_v64i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] +; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512F-NEXT: 
vpmullw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8: diff --git a/test/CodeGen/X86/pmulh.ll b/test/CodeGen/X86/pmulh.ll index eef26dfa8c2..7281e4c2c18 100644 --- a/test/CodeGen/X86/pmulh.ll +++ b/test/CodeGen/X86/pmulh.ll @@ -132,8 +132,11 @@ define <32 x i16> @mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; AVX512F-LABEL: mulhuw_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhuw_v32i16: @@ -165,8 +168,11 @@ define <32 x i16> @mulhw_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; AVX512F-LABEL: mulhw_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhw_v32i16: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index d2498f35572..ebb3b623c46 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -650,6 +650,7 @@ define void @trunc_v8i64_to_v8i8(<64 x 
i8>* %L, <8 x i8>* %S) nounwind { define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -666,6 +667,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -737,6 +739,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -752,6 +755,7 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll index 7ecfac5151f..567c438bd5b 100644 --- a/test/CodeGen/X86/subvector-broadcast.ll +++ b/test/CodeGen/X86/subvector-broadcast.ll @@ -367,25 
+367,11 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_8i16_32i16: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_8i16_32i16: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_8i16_32i16: ; X64-AVX: # %bb.0: @@ -393,22 +379,10 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_8i16_32i16: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; 
X64-AVX512-LABEL: test_broadcast_8i16_32i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16> *%p %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> ret <32 x i16> %2 @@ -422,25 +396,11 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_16i16_32i16: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0 -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0 -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_16i16_32i16: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_16i16_32i16: ; X64-AVX: # %bb.0: @@ -448,22 +408,10 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_16i16_32i16: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: 
test_broadcast_16i16_32i16: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_16i16_32i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16> *%p %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> ret <32 x i16> %2 @@ -504,25 +452,11 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_16i8_64i8: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_16i8_64i8: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_16i8_64i8: ; X64-AVX: # %bb.0: @@ -530,22 +464,10 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_16i8_64i8: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512F-NEXT: vmovdqa %ymm0, 
%ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] -; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_16i8_64i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <16 x i8>, <16 x i8> *%p %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> ret <64 x i8> %2 @@ -559,25 +481,11 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: test_broadcast_32i8_64i8: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0 -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0 -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: test_broadcast_32i8_64i8: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: test_broadcast_32i8_64i8: ; X64-AVX: # %bb.0: @@ -585,22 +493,10 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, 
%ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: test_broadcast_32i8_64i8: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: test_broadcast_32i8_64i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; X64-AVX512-NEXT: retq %1 = load <32 x i8>, <32 x i8> *%p %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> ret <64 x i8> %2 @@ -1332,26 +1228,12 @@ define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_8i16_32i16: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X32-AVX512-NEXT: vinsertf128 
$1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_8i16_32i16: ; X64-AVX: # %bb.0: @@ -1360,26 +1242,12 @@ define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_8i16_32i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> ret <32 x i16> %1 } @@ -1390,42 +1258,22 @@ define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; 
X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_16i16_32i16: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_16i16_32i16: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_16i16_32i16: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> ret <32 x i16> %1 } @@ -1454,26 +1302,12 @@ define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 
-; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_16i8_64i8: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_16i8_64i8: ; X64-AVX: # %bb.0: @@ -1482,26 +1316,12 @@ define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind { ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_16i8_64i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> ret <64 x i8> %1 } @@ -1512,42 +1332,22 @@ define <64 x i8> 
@reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind { ; X32-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX-NEXT: retl ; -; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8: -; X32-AVX512F: # %bb.0: -; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512F-NEXT: retl -; -; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8: -; X32-AVX512BW: # %bb.0: -; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X32-AVX512BW-NEXT: retl -; -; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8: -; X32-AVX512DQ: # %bb.0: -; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X32-AVX512DQ-NEXT: retl +; X32-AVX512-LABEL: reg_broadcast_32i8_64i8: +; X32-AVX512: # %bb.0: +; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X32-AVX512-NEXT: retl ; ; X64-AVX-LABEL: reg_broadcast_32i8_64i8: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX-NEXT: retq ; -; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 -; X64-AVX512BW-NEXT: retq -; -; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8: -; X64-AVX512DQ: # %bb.0: -; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1 -; X64-AVX512DQ-NEXT: retq +; X64-AVX512-LABEL: reg_broadcast_32i8_64i8: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 +; X64-AVX512-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> ret <64 x i8> %1 } diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll index 3f9f96b008c..a39a1ee1598 100644 --- a/test/CodeGen/X86/var-permute-512.ll +++ b/test/CodeGen/X86/var-permute-512.ll @@ -98,174 +98,176 @@ 
define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwi ; NOBW-NEXT: movq %rsp, %rbp ; NOBW-NEXT: andq $-64, %rsp ; NOBW-NEXT: subq $2112, %rsp # imm = 0x840 +; NOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; NOBW-NEXT: vmovd %xmm4, %eax -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vmovaps %ymm0, (%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: 
vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) 
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm0, (%rsp) -; NOBW-NEXT: movzwl 1472(%rsp,%rax,2), %eax +; NOBW-NEXT: movzwl 1536(%rsp,%rax,2), %eax ; NOBW-NEXT: vmovd %eax, %xmm0 ; NOBW-NEXT: vpextrw $1, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $1, 1408(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $1, 1600(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vpextrw $2, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $2, 1344(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $2, 1664(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vpextrw $3, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $3, 1280(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $3, 1728(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vpextrw $4, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $4, 1216(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $4, 1792(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: 
vpextrw $5, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $5, 1152(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $5, 1856(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vpextrw $6, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $6, 1088(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $6, 1920(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vpextrw $7, %xmm4, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $7, 1024(%rsp,%rax,2), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrw $7, 1984(%rsp,%rax,2), %xmm0, %xmm0 ; NOBW-NEXT: vmovd %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: movzwl 1984(%rsp,%rax,2), %eax -; NOBW-NEXT: vmovd %eax, %xmm1 -; NOBW-NEXT: vpextrw $1, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $1, 1920(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $2, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $2, 1856(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $3, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $3, 1792(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $4, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $4, 1728(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $5, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $5, 1664(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $6, %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $6, 1600(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vpextrw $7, %xmm2, %eax -; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $7, 1536(%rsp,%rax,2), %xmm1, %xmm1 -; NOBW-NEXT: vmovd %xmm2, %eax -; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: movzwl 448(%rsp,%rax,2), %eax +; NOBW-NEXT: movzwl 1024(%rsp,%rax,2), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 ; NOBW-NEXT: vpextrw $1, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $1, 384(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $1, 1088(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $2, %xmm2, %eax ; NOBW-NEXT: 
andl $31, %eax -; NOBW-NEXT: vpinsrw $2, 320(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $2, 1152(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $3, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $3, 256(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $3, 1216(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $4, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $4, 192(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $4, 1280(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $5, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $5, 128(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $5, 1344(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $6, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $6, 64(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $6, 1408(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $7, %xmm2, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm2 +; NOBW-NEXT: vpinsrw $7, 1472(%rsp,%rax,2), %xmm4, %xmm2 ; NOBW-NEXT: vmovd %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: movzwl 960(%rsp,%rax,2), %eax +; NOBW-NEXT: movzwl 512(%rsp,%rax,2), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 ; NOBW-NEXT: vpextrw $1, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $1, 896(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $1, 576(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $2, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $2, 832(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $2, 640(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $3, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $3, 768(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $3, 704(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $4, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $4, 704(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $4, 768(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $5, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: 
vpinsrw $5, 640(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $5, 832(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $6, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $6, 576(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrw $6, 896(%rsp,%rax,2), %xmm4, %xmm4 ; NOBW-NEXT: vpextrw $7, %xmm3, %eax ; NOBW-NEXT: andl $31, %eax -; NOBW-NEXT: vpinsrw $7, 512(%rsp,%rax,2), %xmm4, %xmm3 -; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; NOBW-NEXT: vpinsrw $7, 960(%rsp,%rax,2), %xmm4, %xmm3 +; NOBW-NEXT: vmovd %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: movzwl (%rsp,%rax,2), %eax +; NOBW-NEXT: vmovd %eax, %xmm4 +; NOBW-NEXT: vpextrw $1, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $1, 64(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $2, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $2, 128(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $3, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $3, 192(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $4, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $4, 256(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $5, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $5, 320(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $6, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $6, 384(%rsp,%rax,2), %xmm4, %xmm4 +; NOBW-NEXT: vpextrw $7, %xmm1, %eax +; NOBW-NEXT: andl $31, %eax +; NOBW-NEXT: vpinsrw $7, 448(%rsp,%rax,2), %xmm4, %xmm1 +; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; NOBW-NEXT: movq %rbp, %rsp ; NOBW-NEXT: popq %rbp ; NOBW-NEXT: retq @@ -380,334 +382,336 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind { ; NOBW-NEXT: movq %rsp, %rbp ; NOBW-NEXT: andq $-64, %rsp ; NOBW-NEXT: subq $4160, %rsp # imm = 0x1040 +; 
NOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; NOBW-NEXT: vpextrb $0, %xmm4, %eax -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vmovaps %ymm0, (%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps 
%ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, 
{{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: 
vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps 
%ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) +; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp) ; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) -; NOBW-NEXT: vmovaps %ymm0, (%rsp) -; NOBW-NEXT: movzbl 3008(%rsp,%rax), %eax +; NOBW-NEXT: movzbl 3072(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm0 ; NOBW-NEXT: vpextrb $1, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $1, 3136(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $2, 
%xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $2, 3200(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $3, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $3, 3264(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $4, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $4, 3328(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $5, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $5, 3392(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $6, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $6, 3456(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $7, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $8, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $8, 3584(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $9, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $9, 3648(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $10, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $10, 3712(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $11, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $11, 3776(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $12, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $12, 3840(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $13, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; 
NOBW-NEXT: vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $13, 3904(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $14, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0 +; NOBW-NEXT: vpinsrb $14, 3968(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $15, %xmm4, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0 -; NOBW-NEXT: vpextrb $0, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: movzbl 4032(%rsp,%rax), %eax -; NOBW-NEXT: vmovd %eax, %xmm1 -; NOBW-NEXT: vpextrb $1, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $2, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $3, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $4, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $5, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $6, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $7, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $8, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $9, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $10, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $11, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $12, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: 
vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $13, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $14, %xmm2, %eax -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1 -; NOBW-NEXT: vpextrb $15, %xmm2, %eax -; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1 +; NOBW-NEXT: vpinsrb $15, 4032(%rsp,%rax), %xmm0, %xmm0 ; NOBW-NEXT: vpextrb $0, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: movzbl 960(%rsp,%rax), %eax +; NOBW-NEXT: movzbl 2048(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 ; NOBW-NEXT: vpextrb $1, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $1, 2112(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $2, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $2, 2176(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $3, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $3, 2240(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $4, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $4, 2304(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $5, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $5, 2368(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $6, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $6, 2432(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $7, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $8, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; 
NOBW-NEXT: vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $8, 2560(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $9, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $9, 2624(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $10, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $10, 2688(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $11, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $11, 2752(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $12, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $12, 2816(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $13, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $13, 2880(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $14, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $14, 2944(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $15, %xmm2, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2 +; NOBW-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm4, %xmm2 ; NOBW-NEXT: vpextrb $0, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: movzbl 1984(%rsp,%rax), %eax +; NOBW-NEXT: movzbl 1024(%rsp,%rax), %eax ; NOBW-NEXT: vmovd %eax, %xmm4 ; NOBW-NEXT: vpextrb $1, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $1, 1088(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $2, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $2, 1152(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $3, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $3, 1792(%rsp,%rax), 
%xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $3, 1216(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $4, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $4, 1280(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $5, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $5, 1344(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $6, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $6, 1408(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $7, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $8, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $8, 1536(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $9, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $9, 1600(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $10, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $10, 1664(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $11, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $11, 1728(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $12, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $12, 1792(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $13, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb $13, 1856(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $14, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpinsrb 
$14, 1920(%rsp,%rax), %xmm4, %xmm4 ; NOBW-NEXT: vpextrb $15, %xmm3, %eax ; NOBW-NEXT: andl $63, %eax -; NOBW-NEXT: vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3 -; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; NOBW-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3 +; NOBW-NEXT: vpextrb $0, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: movzbl (%rsp,%rax), %eax +; NOBW-NEXT: vmovd %eax, %xmm4 +; NOBW-NEXT: vpextrb $1, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $1, 64(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $2, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $2, 128(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $3, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $3, 192(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $4, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $4, 256(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $5, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $5, 320(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $6, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $6, 384(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $7, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $8, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $8, 512(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $9, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $9, 576(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $10, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $10, 640(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $11, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $11, 704(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $12, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $12, 768(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $13, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; 
NOBW-NEXT: vpinsrb $13, 832(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $14, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $14, 896(%rsp,%rax), %xmm4, %xmm4 +; NOBW-NEXT: vpextrb $15, %xmm1, %eax +; NOBW-NEXT: andl $63, %eax +; NOBW-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm1 +; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; NOBW-NEXT: movq %rbp, %rsp ; NOBW-NEXT: popq %rbp ; NOBW-NEXT: retq diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll index 1415d8549ae..214a4c8598d 100644 --- a/test/CodeGen/X86/vec_shift6.ll +++ b/test/CodeGen/X86/vec_shift6.ll @@ -162,13 +162,23 @@ define <32 x i16> @test7(<32 x i16> %a) { ; SSE-NEXT: pmullw %xmm4, %xmm3 ; SSE-NEXT: retq ; -; AVX-LABEL: test7: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVX2-LABEL: test7: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test7: +; AVX512: # %bb.0: +; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: retq %shl = shl <32 x i16> %a, ret <32 x i16> %shl } diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index e152785b08f..d28af7ad751 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ 
b/test/CodeGen/X86/vector-bitreverse.ll @@ -1323,21 +1323,23 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind { ; ; AVX512F-LABEL: test_bitreverse_v64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_bitreverse_v64i8: @@ -1613,24 +1615,26 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; ; AVX512F-LABEL: test_bitreverse_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 
$1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] ; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_bitreverse_v32i16: diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index a01249d64b3..2d9fb798267 100644 --- 
a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -732,22 +732,26 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind { ; ; AVX512F-LABEL: test_cmp_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: retq @@ -823,24 +827,26 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind { ; ; AVX512F-LABEL: test_cmp_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vmovdqa %xmm4, %xmm1 ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_cmp_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm1 ; AVX512DQ-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq diff --git a/test/CodeGen/X86/vector-fshl-512.ll b/test/CodeGen/X86/vector-fshl-512.ll index 0e07ddb4b64..9f2cad9f954 100644 --- a/test/CodeGen/X86/vector-fshl-512.ll +++ b/test/CodeGen/X86/vector-fshl-512.ll @@ -150,64 +150,72 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> % define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = 
ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpsllvd %zmm7, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm4, %ymm8, %ymm9 +; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2 -; AVX512F-NEXT: vpord %zmm2, %zmm7, %zmm2 -; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3 +; AVX512F-NEXT: vpord %zmm3, %zmm7, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4 ; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512F-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3 -; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1 +; AVX512F-NEXT: vpord %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; 
AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512VL-NEXT: vpsllvd %zmm7, %zmm8, %zmm7 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm4, %ymm8, %ymm9 +; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512VL-NEXT: 
vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2 -; AVX512VL-NEXT: vpord %zmm2, %zmm7, %zmm2 -; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3 +; AVX512VL-NEXT: vpord %zmm3, %zmm7, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm2 +; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = 
ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3 -; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1 +; AVX512VL-NEXT: vpord %zmm1, %zmm4, %zmm1 +; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -252,81 +260,88 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm8 +; AVX512F-NEXT: vextracti64x4 $1, 
%zmm2, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm8, %ymm0, %ymm8 -; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512F-NEXT: vpand %ymm4, %ymm11, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7 +; AVX512F-NEXT: vpsllw $2, %ymm7, %ymm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8 -; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7 +; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm11 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm10 -; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm8, %ymm11, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm10 +; AVX512F-NEXT: vpsrlw $4, %ymm8, %ymm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm7, %ymm11, %ymm11 ; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %ymm9, %ymm12, %ymm13 ; AVX512F-NEXT: vpsllw $5, %ymm13, %ymm13 -; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm14, %ymm11, %ymm11 ; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13 -; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpsrlw $1, %ymm8, %ymm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11 ; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13 -; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm8 +; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm8 +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpaddb %ymm4, %ymm4, 
%ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm7 -; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 -; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512F-NEXT: vpsubb %ymm5, %ymm12, %ymm6 +; AVX512F-NEXT: vpsubb %ymm2, %ymm12, %ymm6 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm4 -; AVX512F-NEXT: vpand %ymm14, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5 +; AVX512F-NEXT: vpand %ymm14, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm4 -; AVX512F-NEXT: vpand %ymm15, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5 +; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm5, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm6 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6 ; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm10 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10 @@ -335,55 +350,56 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm10 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6 -; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm9 +; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm13 +; AVX512VL-NEXT: vpsubb %ymm5, %ymm12, %ymm13 ; AVX512VL-NEXT: vpsllw $5, %ymm13, %ymm13 -; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, 
%ymm4 +; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm14, %ymm9, %ymm9 ; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13 -; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9 ; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13 -; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm7 +; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7 ; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm4, 
%ymm4, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5 ; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm7 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm12, %ymm7 ; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5 ; AVX512VL-NEXT: vpand %ymm14, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5 ; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -706,42 +722,48 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 +; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm2 ; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 +; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2 -; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm7, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -795,62 +817,68 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = 
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6 ; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 ; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7 ; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm6, %ymm2, %ymm10 -; AVX512F-NEXT: vpsrlw %xmm6, %xmm8, %xmm2 -; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512F-NEXT: vpand %ymm2, %ymm10, %ymm8 -; AVX512F-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm3, %ymm9, %ymm9 +; AVX512F-NEXT: vpsrlw %xmm3, %xmm8, %xmm6 +; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6 +; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0 -; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm5 +; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm5 ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsrlw %xmm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm6, %ymm1, 
%ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm9 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 ; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm6, %ymm2, %ymm10 -; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm2 -; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 -; AVX512VL-NEXT: vpand %ymm2, %ymm10, %ymm8 -; AVX512VL-NEXT: vpor %ymm8, %ymm9, %ymm8 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm9, %ymm9 +; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6 +; AVX512VL-NEXT: vpsrlw $8, %xmm6, 
%xmm6 +; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8 ; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm5 +; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsrlw %xmm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -1044,32 +1072,38 @@ define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwin define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm5 -; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm3 -; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm5 +; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm3 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm5 -; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm3 -; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm5 +; AVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; 
AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i16: @@ -1110,12 +1144,14 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7 @@ -1138,34 +1174,37 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-NEXT: vpackuswb %ymm11, %ymm2, %ymm2 ; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2 -; 
AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm5 ; AVX512F-NEXT: vpand %ymm8, %ymm5, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15],ymm3[24],ymm7[24],ymm3[25],ymm7[25],ymm3[26],ymm7[26],ymm3[27],ymm7[27],ymm3[28],ymm7[28],ymm3[29],ymm7[29],ymm3[30],ymm7[30],ymm3[31],ymm7[31] +; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31] ; AVX512F-NEXT: vpmullw %ymm12, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23] -; AVX512F-NEXT: vpmullw %ymm13, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 -; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23] +; AVX512F-NEXT: vpmullw %ymm13, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4 ; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm8, %ymm7, %ymm7 @@ -1189,26 +1228,27 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL-NEXT: vpackuswb %ymm7, %ymm2, %ymm2 ; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, 
%ymm2 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5 ; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vpmullw %ymm11, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm12, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmullw %ymm12, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: 
vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v64i8: @@ -1433,22 +1473,28 @@ define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) no define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -1481,32 +1527,38 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) no define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v64i8: ; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, 
%ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/test/CodeGen/X86/vector-fshl-rot-512.ll b/test/CodeGen/X86/vector-fshl-rot-512.ll index de7959d6b5a..f6302313c0b 100644 --- a/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -34,50 +34,56 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind { define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0 -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, 
%ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -110,102 +116,108 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw $4, 
%ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm8 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8 ; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm9 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9 ; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn 
%ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm8, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpandn %ymm4, %ymm7, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm8 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8 ; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm8 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm9 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 ; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm5, 
%ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm7, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm7, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -316,34 +328,38 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, 
%xmm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; 
AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -383,54 +399,58 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: 
vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -508,34 +528,38 @@ define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind { define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpblendd 
{{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 
+; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i16: @@ -558,12 +582,13 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -573,44 +598,46 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = 
ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; 
AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] -; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpmullw 
%ymm11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -619,38 +646,39 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} 
ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, 
%ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v64i8: @@ -737,22 +765,26 @@ define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind { define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 -; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -775,32 +807,36 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { define <64 x i8> 
@splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: 
vpsllw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/test/CodeGen/X86/vector-fshr-512.ll b/test/CodeGen/X86/vector-fshr-512.ll index 26dbc92918d..893260e1dd6 100644 --- a/test/CodeGen/X86/vector-fshr-512.ll +++ b/test/CodeGen/X86/vector-fshr-512.ll @@ -146,64 +146,72 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> % define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = 
ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm4, %ymm8, %ymm9 +; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm9, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm7, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsllvd %zmm9, %zmm3, %zmm3 +; AVX512F-NEXT: vpord %zmm7, %zmm3, %zmm3 +; AVX512F-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4 ; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsllvd %zmm5, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb 
%ymm2, %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm4, %ymm8, %ymm9 +; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = 
ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm9, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm7, %zmm0, %zmm0 -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsllvd %zmm9, %zmm3, %zmm3 +; AVX512VL-NEXT: vpord %zmm7, %zmm3, %zmm3 +; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm2 +; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4 ; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsllvd %zmm5, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm4, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -248,81 +256,88 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> % define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm8, %ymm2, %ymm8 -; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpand %ymm4, %ymm11, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7 +; AVX512F-NEXT: vpsrlw $2, %ymm7, %ymm11 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm11 ; AVX512F-NEXT: vpsrlw $1, %ymm11, %ymm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm8, %ymm12, %ymm12 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] 
+; AVX512F-NEXT: vpand %ymm7, %ymm12, %ymm12 ; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10 ; AVX512F-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm11 +; AVX512F-NEXT: vpsllw $4, %ymm8, %ymm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm12, %ymm11, %ymm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpsubb %ymm9, %ymm13, %ymm14 ; AVX512F-NEXT: vpsllw $5, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11 ; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm11 +; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11 ; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm10, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8 +; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8 ; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10 ; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2 -; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm7 -; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4 -; 
AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm4 -; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm12, %ymm4, %ymm4 -; AVX512F-NEXT: vpsubb %ymm5, %ymm13, %ymm6 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm8 +; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4 +; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm8 +; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6 +; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5 +; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm12, %ymm5, %ymm5 +; AVX512F-NEXT: vpsubb %ymm2, %ymm13, %ymm6 ; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4 -; AVX512F-NEXT: vpand %ymm15, %ymm4, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5 +; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm5, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: 
vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm6 +; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6 ; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm10 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10 @@ -333,53 +348,54 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) ; AVX512VL-NEXT: vpand %ymm12, %ymm10, %ymm10 ; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm9 +; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 
-; AVX512VL-NEXT: vpsubb %ymm4, %ymm13, %ymm14 +; AVX512VL-NEXT: vpsubb %ymm5, %ymm13, %ymm14 ; AVX512VL-NEXT: vpsllw $5, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9 ; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm9 +; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9 ; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14 -; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4 +; AVX512VL-NEXT: vpor %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm7 +; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4 +; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm7 ; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $1, 
%ymm2, %ymm7 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm7 ; AVX512VL-NEXT: vpand %ymm12, %ymm7, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5 -; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsubb %ymm4, %ymm13, %ymm7 +; AVX512VL-NEXT: vpsubb %ymm2, %ymm13, %ymm7 ; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7 -; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -694,42 +710,48 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxwq 
{{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7 +; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7 ; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw %xmm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm5, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw %xmm7, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm4 +; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX512VL-NEXT: vpand 
{{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7 +; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7 ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw %xmm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm7, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -783,62 +805,68 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 +; 
AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 ; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 ; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 ; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7 ; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm6, %ymm0, %ymm10 -; AVX512F-NEXT: vpsllw %xmm6, %xmm8, %xmm0 -; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm8 -; AVX512F-NEXT: vpand %ymm8, %ymm10, %ymm0 -; AVX512F-NEXT: vpor %ymm9, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm9, %ymm9 +; AVX512F-NEXT: vpsllw %xmm3, %xmm8, %xmm6 +; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8 ; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm5, %ymm3, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpsllw %xmm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm8, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 +; AVX512F-NEXT: vpand %ymm7, 
%ymm5, %ymm5 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 ; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 ; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm6, %ymm0, %ymm10 -; AVX512VL-NEXT: vpsllw %xmm6, %xmm8, %xmm0 -; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm8 -; AVX512VL-NEXT: vpand %ymm8, %ymm10, %ymm0 -; AVX512VL-NEXT: vpor %ymm9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, 
%ymm9, %ymm9 +; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6 +; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 +; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 +; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8 ; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm3, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsllw %xmm6, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -1032,32 +1060,38 @@ define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) nounwin define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm2, %ymm5 +; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5 +; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3 ; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: 
vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2 -; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm2, %ymm5 +; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5 +; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3 ; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; 
AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i16: @@ -1098,20 +1132,22 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: constant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] ; AVX512F-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] @@ -1124,44 +1160,47 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12 ; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12 ; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10 -; AVX512F-NEXT: vpor %ymm10, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] -; AVX512F-NEXT: vpmullw %ymm11, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] 
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] +; AVX512F-NEXT: vpmullw %ymm11, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] ; AVX512F-NEXT: vpmullw %ymm13, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] ; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] @@ -1175,28 +1214,29 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11 ; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11 ; AVX512VL-NEXT: vpackuswb %ymm4, %ymm11, %ymm4 -; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmullw %ymm10, %ymm2, %ymm2 -; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vpmullw %ymm12, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpackuswb %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, 
%ymm1, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v64i8: @@ -1417,22 +1457,28 @@ define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y) no define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 +; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -1465,32 +1511,38 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) no define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind { ; AVX512F-LABEL: 
splatconstant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, 
%ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/test/CodeGen/X86/vector-fshr-rot-512.ll b/test/CodeGen/X86/vector-fshr-rot-512.ll index 2a25efd50ff..84d14b6e3cf 100644 --- a/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -34,56 +34,62 @@ define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounwind { define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm6, %zmm0, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm2, %ymm7, %ymm2 +; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm0, %zmm6, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm2 -; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512F-NEXT: vpsubw %ymm2, %ymm7, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_funnnel_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm6, %zmm0, %zmm6 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm2, %ymm7, %ymm2 +; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm0, %zmm6, %zmm0 -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm2 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, 
%ymm2 +; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512VL-NEXT: vpsubw %ymm2, %ymm7, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i16: @@ -116,10 +122,12 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, 
%zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -127,52 +135,55 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm9 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9 ; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9 ; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm10 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10 ; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; 
AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3 -; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm8, %ymm2 -; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: 
var_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -180,44 +191,45 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpandn %ymm4, %ymm8, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm9 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm9 ; AVX512VL-NEXT: vpand %ymm8, %ymm9, %ymm9 ; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm10 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 ; 
AVX512VL-NEXT: vpor %ymm4, %ymm10, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsubb %ymm3, %ymm6, %ymm3 -; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm8, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm1 +; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm8, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512VL-NEXT: vpand %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm9, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; 
AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v64i8: @@ -328,38 +340,42 @@ define <16 x i32> @splatvar_funnnel_v16i32(<16 x i32> %x, <16 x i32> %amt) nounw define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_funnnel_v32i16: ; AVX512VL: # %bb.0: 
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsubw %xmm2, %xmm3, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i16: @@ -399,58 +415,62 @@ define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounw define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512F-NEXT: vpsubb %xmm2, 
%xmm3, %xmm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; 
AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 ; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: @@ -528,34 +548,38 @@ define <16 x i32> @constant_funnnel_v16i32(<16 x i32> %x) nounwind { define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] -; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # 
%bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] -; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v32i16: @@ -578,12 +602,13 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind { define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, 
%ymm2, %ymm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -593,44 +618,46 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpunpcklbw 
{{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: 
vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] -; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536] ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 
@@ -639,38 +666,39 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; 
AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_funnnel_v64i8: @@ -757,22 +785,26 @@ define <16 x i32> @splatconstant_funnnel_v16i32(<16 x i32> %x) nounwind { define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; 
AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v32i16: @@ -795,32 +827,36 @@ define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind { define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind { ; AVX512F-LABEL: splatconstant_funnnel_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, 
%zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_funnnel_v64i8: diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 961bec56e5d..336311e1b79 100644 --- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -102,15 +102,17 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3 -; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0 
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm2 +; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3 ; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2 +; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_32i16: @@ -127,44 +129,46 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_div7_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4 +; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512F-NEXT: 
vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm7 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm7 ; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_64i8: @@ -201,66 +205,68 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_divconstant_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm3, 
%ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw 
{{.*}}(%rip), %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512F-NEXT: vpsraw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; 
AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_divconstant_64i8: @@ -435,20 +441,22 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] -; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 ; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3 ; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm1, 
%ymm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_32i16: @@ -467,17 +475,18 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_rem7_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4 +; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -492,17 +501,17 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4 ; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4 +; AVX512F-NEXT: 
vpmovsxbw %xmm0, %ymm4 ; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 @@ -513,7 +522,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_64i8: @@ -554,19 +564,20 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_remconstant_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5 +; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm5 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 @@ -587,24 +598,24 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpaddb %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), 
%ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm6 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm6 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 @@ -624,12 +635,13 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; 
AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_64i8: diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll index 495d35a0c84..497c09943b7 100644 --- a/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -103,17 +103,19 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind { define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 -; 
AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_32i16: @@ -131,36 +133,38 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_div7_64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm6, %ymm6 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_div7_64i8: @@ -192,61 +196,63 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_divconstant_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, 
%xmm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; 
AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_divconstant_64i8: @@ -433,22 +439,24 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind { define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm4 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 ; 
AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_32i16: @@ -468,16 +476,17 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind { define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_rem7_64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm5 +; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -489,15 +498,15 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7 ; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 @@ -506,7 +515,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_rem7_64i8: @@ -542,72 +552,74 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: test_remconstant_64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; 
AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: 
vpsubb %ymm3, %ymm0, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31] +; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsubb %ymm2, %ymm3, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] +; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm4 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw 
{{.*}}(%rip), %ymm4, %ymm4 -; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4 +; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsubb %ymm4, %ymm1, %ymm5 -; 
AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15],ymm5[24],ymm2[24],ymm5[25],ymm2[25],ymm5[26],ymm2[26],ymm5[27],ymm2[27],ymm5[28],ymm2[28],ymm5[29],ymm2[29],ymm5[30],ymm2[30],ymm5[31],ymm2[31] +; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5 +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6 ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31] ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5 -; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 -; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: test_remconstant_64i8: diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll index 7c54724f5b5..5a9aadd209c 100644 --- a/test/CodeGen/X86/vector-lzcnt-512.ll +++ 
b/test/CodeGen/X86/vector-lzcnt-512.ll @@ -331,15 +331,17 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16: @@ -378,9 +380,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512DQ-LABEL: testv32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] 
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -388,23 +391,24 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out @@ -413,15 +417,17 @@ define <32 x 
i16> @testv32i16(<32 x i16> %in) nounwind { define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16u: @@ -460,9 +466,10 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; AVX512DQ-LABEL: testv32i16u: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512DQ-NEXT: 
vpsrlw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -470,23 +477,24 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpaddw %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out @@ -495,27 +503,29 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { define <64 x i8> 
@testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8: ; AVX512CD: # %bb.0: -; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 -; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = 
[24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 +; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8: @@ -560,23 +570,25 
@@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512DQ-LABEL: testv64i8: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out @@ -585,27 +597,29 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8u: ; AVX512CD: # %bb.0: 
-; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 -; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] -; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 -; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 -; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 ; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, 
%xmm2 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero +; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2 +; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2 +; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 +; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8u: @@ -650,23 +664,25 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; ; AVX512DQ-LABEL: 
testv64i8u: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq %out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index eae9e6c79bd..a4afda2977b 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -130,26 +130,28 @@ define <16 x i32> 
@testv16i32(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-LABEL: testv32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3 -; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm2 +; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv32i16: @@ -169,12 +171,14 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; 
AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: testv32i16: @@ -203,20 +207,22 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512F-LABEL: testv64i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 -; 
AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: testv64i8: @@ -233,20 +239,22 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: +; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: 
vpand %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: retq ; ; AVX512VPOPCNTDQ-BW-LABEL: testv64i8: diff --git a/test/CodeGen/X86/vector-reduce-and-bool.ll b/test/CodeGen/X86/vector-reduce-and-bool.ll index d8ce3a81e94..2d5d13d4a6e 100644 --- a/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -763,6 +763,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: trunc_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 @@ -879,6 +880,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: trunc_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2 @@ -1699,6 +1701,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: icmp_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 @@ -1780,6 +1783,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: icmp_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, 
%ymm1 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll index b70f964a542..cbf7d6a14f2 100644 --- a/test/CodeGen/X86/vector-reduce-mul.ll +++ b/test/CodeGen/X86/vector-reduce-mul.ll @@ -1353,6 +1353,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; ; AVX512DQ-LABEL: test_v32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1369,6 +1370,7 @@ define i16 @test_v32i16(<32 x i16> %a0) { ; ; AVX512DQVL-LABEL: test_v32i16: ; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2421,6 +2423,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; ; AVX512DQ-LABEL: test_v64i8: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 @@ -2463,6 +2466,7 @@ define i8 @test_v64i8(<64 x i8> %a0) { ; ; AVX512DQVL-LABEL: test_v64i8: ; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 diff --git a/test/CodeGen/X86/vector-reduce-or-bool.ll b/test/CodeGen/X86/vector-reduce-or-bool.ll index b66e5148bdf..91157ee54a5 100644 --- a/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -754,6 +754,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: trunc_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 @@ -870,6 +871,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: trunc_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 @@ -1682,6 +1684,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: icmp_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -1769,6 +1772,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: icmp_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-reduce-xor-bool.ll 
b/test/CodeGen/X86/vector-reduce-xor-bool.ll index 03861ffd261..36395a1b80b 100644 --- a/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -851,6 +851,7 @@ define i1 @trunc_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: trunc_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 @@ -979,6 +980,7 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: trunc_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2 @@ -1888,6 +1890,7 @@ define i1 @icmp_v32i16_v32i1(<32 x i16>) { ; ; AVX512F-LABEL: icmp_v32i16_v32i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 @@ -2036,6 +2039,7 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>) { ; ; AVX512F-LABEL: icmp_v64i8_v64i1: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll index b7793ed6c92..2c3dbd8bb92 100644 --- a/test/CodeGen/X86/vector-rotate-512.ll +++ b/test/CodeGen/X86/vector-rotate-512.ll @@ -35,50 +35,56 @@ define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512F-LABEL: var_rotate_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_rotate_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: 
vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0 -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2 +; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3 -; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero -; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1 -; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; 
AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_rotate_v32i16: @@ -108,96 +114,102 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-LABEL: var_rotate_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm7 +; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7 ; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7 ; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm8 +; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8 ; AVX512F-NEXT: vpor 
%ymm4, %ymm8, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512F-NEXT: vpandn %ymm2, %ymm6, %ymm2 -; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: var_rotate_v64i8: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6 +; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6 ; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6 ; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm7 +; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7 ; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7 ; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm8 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 ; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; 
AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2 -; AVX512VL-NEXT: vpandn %ymm2, %ymm6, %ymm2 -; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3 +; AVX512VL-NEXT: vpandn %ymm3, %ymm6, %ymm3 +; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2 -; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 +; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_rotate_v64i8: @@ -311,34 +323,38 @@ define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512F-LABEL: splatvar_rotate_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2 -; 
AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_rotate_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2 -; AVX512VL-NEXT: 
vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_rotate_v32i16: @@ -373,54 +389,58 @@ define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512F-LABEL: splatvar_rotate_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = 
xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatvar_rotate_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 ; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512VL-NEXT: 
vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5 +; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_rotate_v64i8: @@ -497,24 +517,28 @@ define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: constant_rotate_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1 +; 
AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 +; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_rotate_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 +; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_rotate_v32i16: @@ -539,12 +563,13 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: constant_rotate_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = 
[252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -554,44 +579,46 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 ; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9 ; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = 
[256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512F-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31] +; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31] ; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23] -; AVX512F-NEXT: vpmullw %ymm11, %ymm1, 
%ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23] +; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: constant_rotate_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] ; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5 @@ -600,38 +627,39 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128] ; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2 ; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2 ; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3 ; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512VL-NEXT: vpunpckhbw {{.*#+}} 
ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: constant_rotate_v64i8: @@ -724,22 +752,26 @@ define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: splatconstant_rotate_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 -; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2 ; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_v32i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2 ; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_v32i16: @@ -764,32 +796,36 @@ define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: splatconstant_rotate_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_v64i8: @@ -850,28 +886,32 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] -; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3 -; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3 ; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3 +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] -; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3 -; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 ; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: @@ -902,38 +942,42 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind { ; AVX512F-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] -; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4 ; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpor %ymm4, %ymm1, 
%ymm1 +; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4 +; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2 -; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] -; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4 ; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1 ; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39] ; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4 +; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; 
AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8: diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 6f0b59ef32d..81071907584 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -181,10 +181,10 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss ; ; AVX512F-LABEL: sext_32i8_to_32i16: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1 +; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: sext_32i8_to_32i16: @@ -3279,11 +3279,13 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { ; ; AVX512F-LABEL: sext_32xi1_to_32xi8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll index e4f676cecd6..1bb62977ca9 100644 --- a/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -27,14 +27,17 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero +; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2 +; AVX512DQ-NEXT: vpsravd %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 -; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -48,9 +51,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; 
AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm6 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpsraw $2, %ymm5, %ymm6 @@ -60,21 +65,21 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpsrlw $8, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] -; AVX512DQ-NEXT: 
vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpsraw $4, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsraw $2, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsraw $1, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpsraw $2, %ymm4, %ymm5 @@ -84,18 +89,19 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512DQ-NEXT: 
vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-NEXT: vpsraw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsraw $2, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsraw $1, %ymm1, %ymm4 -; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: @@ -163,9 +169,11 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} 
xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vpsraw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: @@ -181,21 +189,23 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpxor %ymm4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsubb %ymm4, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: 
splatvar_shift_v64i8: @@ -242,13 +252,15 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: @@ -262,28 +274,30 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] -; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] +; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: @@ -327,8 +341,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsraw $3, %ymm1, 
%ymm1 +; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: @@ -342,16 +358,18 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512DQ-NEXT: vpxor %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpxor %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: @@ -369,9 +387,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) { ; AVX512DQ-LABEL: ashr_const7_v64i8: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 ; 
AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: ashr_const7_v64i8: diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll index f32b56d6035..81ab84315ea 100644 --- a/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -27,14 +27,17 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512DQ-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: 
vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -48,33 +51,36 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsrlw $2, %ymm3, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsrlw $1, %ymm3, %ymm4 ; AVX512DQ-NEXT: 
vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512DQ-NEXT: vpand %ymm7, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $2, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $1, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm7, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: @@ -127,9 +133,11 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: @@ -145,15 +153,17 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: @@ -196,13 +206,15 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: @@ -216,25 +228,27 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256] ; AVX512DQ-NEXT: 
# ymm4 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2] ; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpmullw %ymm5, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX512DQ-NEXT: vpmullw %ymm5, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512DQ-NEXT: vpmullw %ymm5, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: @@ -278,8 +292,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: @@ -293,11 +309,13 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: 
vpsrlw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll index f63e1ab8d94..b8af184486e 100644 --- a/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/test/CodeGen/X86/vector-shift-shl-512.ll @@ -27,14 +27,17 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind { define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero +; AVX512DQ-NEXT: vpsllvd %zmm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v32i16: @@ -48,30 +51,33 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: var_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpsllw $4, %ymm3, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsllw $2, %ymm3, %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = 
[252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: var_shift_v64i8: @@ -122,9 +128,11 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 
= xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v32i16: @@ -140,14 +148,16 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512DQ-LABEL: splatvar_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsllw %xmm2, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpsllw %xmm1, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatvar_shift_v64i8: @@ -189,9 +199,11 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = 
[1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v32i16: @@ -205,28 +217,30 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] ; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] -; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm4, %ymm4, %ymm6 -; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7 -; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512DQ-NEXT: vpand %ymm5, 
%ymm2, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm4, %ymm4, %ymm6 ; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7 ; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: constant_shift_v64i8: @@ -275,8 +289,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind { define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v32i16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v32i16: @@ -290,11 +306,13 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind { define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: splatconstant_shift_v64i8: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] -; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = 
[248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_shift_v64i8: diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll index a77099abedc..c350e7f6d27 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -8,7 +8,7 @@ define <32 x i16> @shuffle_v32i16(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16: ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16: @@ -24,7 +24,7 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; KNL: ## %bb.0: ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: @@ -39,16 +39,18 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: ; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] -; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; KNL-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm0[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] +; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> -; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3 -; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15] -; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] -; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3 +; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] ; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> -; KNL-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: @@ -63,19 +65,22 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: 
shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: ; KNL: ## %bb.0: -; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8,9,10,11],ymm2[12,13],ymm1[14],ymm2[15] -; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u] +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] +; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u] ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm4 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15] ; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u] -; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 -; KNL-NEXT: vpbroadcastw %xmm3, %ymm3 -; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,6],ymm3[7],ymm1[8,9,10,11,12,13,14],ymm3[15] -; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15] +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; KNL-NEXT: vpbroadcastw %xmm1, %ymm1 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15] +; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17] ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: @@ -88,29 +93,19 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1 } define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: -; KNL: ## %bb.0: -; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] -; KNL-NEXT: retq -; -; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: -; SKX: ## %bb.0: -; SKX-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; SKX-NEXT: retq +; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; ALL: ## %bb.0: +; ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; ALL-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: -; KNL: ## %bb.0: -; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] -; KNL-NEXT: retq -; -; SKX-LABEL: 
shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: -; SKX: ## %bb.0: -; SKX-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; SKX-NEXT: retq +; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; ALL: ## %bb.0: +; ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; ALL-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } @@ -118,8 +113,10 @@ define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: ; KNL: ## %bb.0: -; KNL-NEXT: vpsrld $16, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpsrld $16, %ymm1, %ymm1 +; KNL-NEXT: vpsrld $16, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: @@ -133,8 +130,10 @@ define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_2 define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: ; KNL: ## %bb.0: -; KNL-NEXT: vpslld $16, %ymm0, %ymm0 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpslld $16, %ymm1, %ymm1 +; KNL-NEXT: vpslld $16, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: 
shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: @@ -148,8 +147,10 @@ define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: ; KNL: ## %bb.0: -; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: @@ -163,8 +164,10 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_1 define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: ; KNL: ## %bb.0: -; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: @@ -178,10 +181,12 @@ define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18 define <32 x i16> 
@shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { ; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: ; KNL: ## %bb.0: -; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] -; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] ; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: @@ -198,7 +203,6 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a ; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF ; KNL-NEXT: vmovd %eax, %xmm1 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: @@ -215,7 +219,7 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_i32: ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v32i16_i32: @@ -233,7 +237,7 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: @@ -252,7 +256,7 @@ define <32 x i16> 
@insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { ; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: @@ -270,7 +274,7 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { ; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32: ; KNL: ## %bb.0: ; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 -; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: @@ -287,10 +291,10 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: @@ -304,10 +308,10 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z define <32 x i16> 
@shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: @@ -321,6 +325,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_z define <8 x i16> @pr32967(<32 x i16> %v) { ; KNL-LABEL: pr32967: ; KNL: ## %bb.0: +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 ; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] @@ -351,8 +356,10 @@ define <8 x i16> @pr32967(<32 x i16> %v) { define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: ; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[18,19],zero,zero +; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero +; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz: diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll index 2f1c598826a..7e5847eb153 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -16,8 +16,10 @@ define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_ define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +;
AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -27,8 +29,10 @@ define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ ; ; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -42,8 +46,10 @@ define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: 
shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: @@ -53,8 +59,10 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_ ; ; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: 
shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz: @@ -68,8 +76,11 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -79,8 +90,11 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ ; ; AVX512DQ-LABEL: 
shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] -; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62: @@ -98,7 +112,6 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512F-NEXT: movl $255, %eax ; AVX512F-NEXT: vmovd %eax, %xmm1 ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: @@ -111,7 +124,6 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512DQ-NEXT: movl $255, %eax ; AVX512DQ-NEXT: vmovd %eax, %xmm1 ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: @@ -126,7 +138,7 @@ define <64 x i8> 
@shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -137,7 +149,7 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -151,12 +163,13 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; 
AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: @@ -167,12 +180,13 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpermq 
{{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00: @@ -188,7 +202,7 @@ define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) { ; AVX512F-LABEL: insert_dup_mem_v64i8_i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_mem_v64i8_i32: @@ -199,7 +213,7 @@ define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) { ; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32: @@ -217,7 +231,7 @@ define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) { ; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8: @@ -228,7 +242,7 @@ define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) { ; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8: @@ -247,7 +261,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) { ; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512F: # %bb.0: ; 
AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32: @@ -258,7 +272,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) { ; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32: @@ -276,7 +290,7 @@ define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) { ; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32: @@ -287,7 +301,7 @@ define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) { ; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32: @@ -308,7 +322,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) { ; AVX512F-NEXT: shrl $8, %eax ; AVX512F-NEXT: vmovd %eax, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8: @@ -324,7 +338,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) { ; AVX512DQ-NEXT: shrl $8, %eax ; AVX512DQ-NEXT: vmovd %eax, %xmm0 ; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: 
insert_dup_elt1_mem_v64i8_sext_i8: @@ -344,10 +358,10 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) { define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: @@ -357,10 +371,10 @@ define <64 x i8> 
@shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz: @@ -374,10 +388,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) { ; AVX512F-LABEL: 
shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: @@ -387,10 +401,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz: @@ -404,10 +418,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: @@ -417,10 +431,10 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_ ; ; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz: @@ -434,12 +448,13 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512F: # 
%bb.0: -; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: @@ -450,12 +465,13 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz: @@ -471,15 +487,17 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] -; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} 
ymm2 = ymm2[2,3,0,1] +; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -498,15 +516,17 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: retq ; ; 
AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: @@ -521,14 +541,17 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: @@ -542,14 +565,17 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; ; 
AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: @@ -568,12 +594,15 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: 
vextracti64x4 $1, %zmm1, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1 +; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: @@ -585,12 +614,15 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_ ; ; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1 +; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: 
shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126: diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 2092b3bf453..c3b6265ee98 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -588,9 +588,12 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){ ; ; KNL64-LABEL: test_mm512_mask_blend_epi8: ; KNL64: # %bb.0: # %entry -; KNL64-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; KNL64-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 -; KNL64-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1 +; KNL64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; KNL64-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL64-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; KNL64-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm3 +; KNL64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; KNL64-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; KNL64-NEXT: retq ; ; SKX32-LABEL: test_mm512_mask_blend_epi8: @@ -603,19 +606,12 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){ ; ; KNL32-LABEL: test_mm512_mask_blend_epi8: ; KNL32: # %bb.0: # %entry -; KNL32-NEXT: pushl %ebp -; KNL32-NEXT: .cfi_def_cfa_offset 8 -; KNL32-NEXT: .cfi_offset %ebp, -8 -; KNL32-NEXT: movl %esp, %ebp -; KNL32-NEXT: .cfi_def_cfa_register %ebp -; KNL32-NEXT: andl $-32, %esp -; KNL32-NEXT: subl $32, %esp -; KNL32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; KNL32-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 -; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1 -; KNL32-NEXT: movl %ebp, %esp -; KNL32-NEXT: popl %ebp -; KNL32-NEXT: .cfi_def_cfa %esp, 4 +; KNL32-NEXT: vpbroadcastw 
{{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; KNL32-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL32-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; KNL32-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm3 +; KNL32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; KNL32-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; KNL32-NEXT: retl entry: %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> @@ -632,8 +628,11 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){ ; ; KNL64-LABEL: test_mm512_mask_blend_epi16: ; KNL64: # %bb.0: # %entry -; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; KNL64-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; KNL64-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL64-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL64-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7],ymm3[8],ymm2[9],ymm3[10],ymm2[11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; KNL64-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL64-NEXT: retq ; ; SKX32-LABEL: test_mm512_mask_blend_epi16: @@ -645,18 +644,11 @@ define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){ ; ; KNL32-LABEL: test_mm512_mask_blend_epi16: ; KNL32: # %bb.0: # %entry -; KNL32-NEXT: pushl %ebp -; KNL32-NEXT: .cfi_def_cfa_offset 8 -; KNL32-NEXT: .cfi_offset %ebp, -8 -; KNL32-NEXT: movl %esp, %ebp -; KNL32-NEXT: .cfi_def_cfa_register %ebp -; KNL32-NEXT: andl $-32, %esp -; KNL32-NEXT: subl $32, %esp -; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15] -; KNL32-NEXT: movl %ebp, %esp -; KNL32-NEXT: popl %ebp -; KNL32-NEXT: .cfi_def_cfa %esp, 4 +; KNL32-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL32-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL32-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7],ymm3[8],ymm2[9],ymm3[10],ymm2[11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; KNL32-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; KNL32-NEXT: retl entry: %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index d926e5427dd..94b00fbd937 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -267,42 +267,50 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) { ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0 +; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0 ; 
AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm0, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 ; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0 +; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} ; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 +; AVX512VL-NEXT: vpermi2d %zmm0, %zmm5, %zmm6 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512VL-NEXT: vpternlogd $255, 
%zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: @@ -381,6 +389,8 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) { ; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} @@ -389,13 +399,16 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm0, %ymm1 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0 -; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1 +; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4 +; 
AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm5 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} @@ -404,9 +417,10 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6 ; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1 ; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1 +; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index 01e68925744..a0306dc1cd4 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -3023,12 +3023,14 @@ define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) { ; ; AVX512F-LABEL: trunc_packus_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_packus_v32i16_v32i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index 96ca65ce3c9..bb734bb8e32 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -3000,12 +3000,14 @@ define <32 x i8> 
@trunc_ssat_v32i16_v32i8(<32 x i16> %a0) { ; ; AVX512F-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512VL-NEXT: retq diff --git a/test/CodeGen/X86/vector-trunc-usat.ll b/test/CodeGen/X86/vector-trunc-usat.ll index 2648a0c29fa..fa03c8984ad 100644 --- a/test/CodeGen/X86/vector-trunc-usat.ll +++ b/test/CodeGen/X86/vector-trunc-usat.ll @@ -2392,6 +2392,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) { ; ; AVX512F-LABEL: trunc_usat_v32i16_v32i8: ; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0 @@ -2404,6 +2405,7 @@ define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) { ; ; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index b09d14e5e2b..f3bbc15c369 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -1317,6 +1317,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX512F-LABEL: trunc32i16_32i8: ; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpmovdb %zmm1, (%rax) ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero @@ -1326,6 +1327,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) { ; ; AVX512VL-LABEL: trunc32i16_32i8: ; AVX512VL: # %bb.0: # %entry +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-NEXT: vpmovdb %zmm1, (%rax) ; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 501d7e96835..83f0d25ee80 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -260,31 +260,33 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16: ; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4 -; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1 ; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2 +; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16: @@ -325,17 +327,19 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i16: ; AVX512VPOPCNTDQ: # 
%bb.0: +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16: @@ -352,31 +356,33 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512CD-LABEL: testv32i16u: ; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, 
%ymm2, %ymm2 -; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4 -; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0 -; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4 +; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1 ; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2 +; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0 +; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512CD-NEXT: 
vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv32i16u: @@ -417,17 +423,19 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv32i16u: ; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv32i16u: @@ -444,25 +452,27 @@ define <32 x i16> 
@testv32i16u(<32 x i16> %in) nounwind { define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8: ; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8: @@ -497,25 +507,27 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv64i8: ; AVX512VPOPCNTDQ: # %bb.0: +; 
AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv64i8: @@ -532,25 +544,27 @@ define <64 x i8> @testv64i8(<64 x i8> %in) 
nounwind { define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512CD-LABEL: testv64i8u: ; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3 -; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2 -; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512CD-NEXT: retq ; ; AVX512CDBW-LABEL: testv64i8u: @@ -585,25 +599,27 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; ; AVX512VPOPCNTDQ-LABEL: testv64i8u: ; AVX512VPOPCNTDQ: # %bb.0: +; AVX512VPOPCNTDQ-NEXT: vextracti64x4 
$1, %zmm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 -; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2 -; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: retq ; ; BITALG-LABEL: testv64i8u: diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll index 
a7bf74e0c42..cbf49c2306d 100644 --- a/test/CodeGen/X86/vector-zext.ll +++ b/test/CodeGen/X86/vector-zext.ll @@ -144,10 +144,10 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { ; ; AVX512F-LABEL: zext_32i8_to_32i16: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: zext_32i8_to_32i16: diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll index cbb6a9aec6e..4857b19eaee 100644 --- a/test/CodeGen/X86/viabs.ll +++ b/test/CodeGen/X86/viabs.ll @@ -929,8 +929,10 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind { ; ; AVX512F-LABEL: test_abs_lt_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: 
[0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01] ; AVX512F-NEXT: vpabsb %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc9] +; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; AVX512F-NEXT: retq # encoding: [0xc3] ; ; AVX512BW-LABEL: test_abs_lt_v64i8: @@ -1000,8 +1002,10 @@ define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind { ; ; AVX512F-LABEL: test_abs_gt_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0] +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01] ; AVX512F-NEXT: vpabsw %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc9] +; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0] +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01] ; AVX512F-NEXT: retq # encoding: [0xc3] ; ; AVX512BW-LABEL: test_abs_gt_v32i16: -- 2.40.0