now stored in the lower bits of an xmm register and the upper bits are
undefined. Previously the elements were spread apart with undefined bits in
between them.
+* v32i16 and v64i8 vectors with AVX512F enabled, but AVX512BW disabled will now
+ be passed in ZMM registers for calls and returns. Previously they were passed
+ in two YMM registers. Old behavior can be enabled by passing
+ -x86-enable-old-knl-abi
Changes to the AMDGPU Target
-----------------------------
" of the loop header PC will be 0)."),
cl::Hidden);
+// Added in 10.0.
+// Escape hatch restoring the pre-10.0 ABI for 512-bit integer vectors on
+// AVX512F-without-AVX512BW (KNL-class) targets: when set, v32i16/v64i8
+// arguments and return values are split across two YMM registers instead of
+// being passed in a single ZMM register (the new default).
+static cl::opt<bool> EnableOldKNLABI(
+ "x86-enable-old-knl-abi", cl::init(false),
+ cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
+ "one ZMM register on AVX512F, but not AVX512BW targets."),
+ cl::Hidden);
+
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return MVT::v32i8;
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
EVT VT) const {
if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
return 1;
+ // FIXME: Should we just make these types legal and custom split operations?
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
+ Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
-; X32-KNL-LABEL: allones_v64i8:
-; X32-KNL: # %bb.0:
-; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-KNL-NEXT: retl
-;
-; X32-SKX-LABEL: allones_v64i8:
-; X32-SKX: # %bb.0:
-; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; X32-SKX-NEXT: retl
+; X32-AVX512-LABEL: allones_v64i8:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v64i8:
; X64-SSE: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
-; X64-KNL-LABEL: allones_v64i8:
-; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; X64-KNL-NEXT: retq
-;
-; X64-SKX-LABEL: allones_v64i8:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; X64-SKX-NEXT: retq
+; X64-AVX512-LABEL: allones_v64i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
ret <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
}
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
-; X32-KNL-LABEL: allones_v32i16:
-; X32-KNL: # %bb.0:
-; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; X32-KNL-NEXT: retl
-;
-; X32-SKX-LABEL: allones_v32i16:
-; X32-SKX: # %bb.0:
-; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; X32-SKX-NEXT: retl
+; X32-AVX512-LABEL: allones_v32i16:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v32i16:
; X64-SSE: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
-; X64-KNL-LABEL: allones_v32i16:
-; X64-KNL: # %bb.0:
-; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; X64-KNL-NEXT: retq
-;
-; X64-SKX-LABEL: allones_v32i16:
-; X64-SKX: # %bb.0:
-; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; X64-SKX-NEXT: retq
+; X64-AVX512-LABEL: allones_v32i16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
ret <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
}
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
+; AVX512F-NEXT: movl %edi, %ecx
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: movl %edi, %edx
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: shrq $32, %rdi
+; AVX512F-NEXT: shrq $48, %rax
+; AVX512F-NEXT: shrl $16, %ecx
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpavgb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
-; AVX512F-NEXT: kmovw %edx, %k4
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: kmovw %edi, %k4
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z}
+; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
+; AVX512F-NEXT: movl %edi, %ecx
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: movl %edi, %edx
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: shrq $32, %rdi
+; AVX512F-NEXT: shrq $48, %rax
+; AVX512F-NEXT: shrl $16, %ecx
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
-; AVX512F-NEXT: kmovw %edx, %k4
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k4} {z}
+; AVX512F-NEXT: kmovw %edi, %k4
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k3} {z}
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpavgw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpavgw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
;
; AVX512F-LABEL: avg_v64i8_3:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpavgb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpavgb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_3:
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-NEW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=knl -x86-enable-old-knl-abi | FileCheck %s --check-prefix=ALL_X64 --check-prefix=KNL --check-prefix=KNL-OLD
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mcpu=skx | FileCheck %s --check-prefix=ALL_X64 --check-prefix=SKX
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mcpu=knl | FileCheck %s --check-prefix=KNL_X32
%bar = load <1 x i1>, <1 x i1>* %foo
ret <1 x i1> %bar
}
+
+define void @test14(<32 x i16>* %x) {
+; KNL-NEW-LABEL: test14:
+; KNL-NEW: ## %bb.0:
+; KNL-NEW-NEXT: pushq %rbx
+; KNL-NEW-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEW-NEXT: .cfi_offset %rbx, -16
+; KNL-NEW-NEXT: movq %rdi, %rbx
+; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0
+; KNL-NEW-NEXT: callq _test14_callee
+; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx)
+; KNL-NEW-NEXT: popq %rbx
+; KNL-NEW-NEXT: retq
+;
+; KNL-OLD-LABEL: test14:
+; KNL-OLD: ## %bb.0:
+; KNL-OLD-NEXT: pushq %rbx
+; KNL-OLD-NEXT: .cfi_def_cfa_offset 16
+; KNL-OLD-NEXT: .cfi_offset %rbx, -16
+; KNL-OLD-NEXT: movq %rdi, %rbx
+; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0
+; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1
+; KNL-OLD-NEXT: callq _test14_callee
+; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx)
+; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx)
+; KNL-OLD-NEXT: popq %rbx
+; KNL-OLD-NEXT: retq
+;
+; SKX-LABEL: test14:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbx
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbx, -16
+; SKX-NEXT: movq %rdi, %rbx
+; SKX-NEXT: vmovaps (%rdi), %zmm0
+; SKX-NEXT: callq _test14_callee
+; SKX-NEXT: vmovaps %zmm0, (%rbx)
+; SKX-NEXT: popq %rbx
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test14:
+; KNL_X32: ## %bb.0:
+; KNL_X32-NEXT: pushl %esi
+; KNL_X32-NEXT: .cfi_def_cfa_offset 8
+; KNL_X32-NEXT: subl $8, %esp
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: .cfi_offset %esi, -8
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; KNL_X32-NEXT: vmovaps (%esi), %zmm0
+; KNL_X32-NEXT: calll _test14_callee
+; KNL_X32-NEXT: vmovaps %zmm0, (%esi)
+; KNL_X32-NEXT: addl $8, %esp
+; KNL_X32-NEXT: popl %esi
+; KNL_X32-NEXT: retl
+ %a = load <32 x i16>, <32 x i16>* %x
+ %b = call <32 x i16> @test14_callee(<32 x i16> %a)
+ store <32 x i16> %b, <32 x i16>* %x
+ ret void
+}
+declare <32 x i16> @test14_callee(<32 x i16>)
+
+define void @test15(<64 x i8>* %x) {
+; KNL-NEW-LABEL: test15:
+; KNL-NEW: ## %bb.0:
+; KNL-NEW-NEXT: pushq %rbx
+; KNL-NEW-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEW-NEXT: .cfi_offset %rbx, -16
+; KNL-NEW-NEXT: movq %rdi, %rbx
+; KNL-NEW-NEXT: vmovaps (%rdi), %zmm0
+; KNL-NEW-NEXT: callq _test15_callee
+; KNL-NEW-NEXT: vmovaps %zmm0, (%rbx)
+; KNL-NEW-NEXT: popq %rbx
+; KNL-NEW-NEXT: retq
+;
+; KNL-OLD-LABEL: test15:
+; KNL-OLD: ## %bb.0:
+; KNL-OLD-NEXT: pushq %rbx
+; KNL-OLD-NEXT: .cfi_def_cfa_offset 16
+; KNL-OLD-NEXT: .cfi_offset %rbx, -16
+; KNL-OLD-NEXT: movq %rdi, %rbx
+; KNL-OLD-NEXT: vmovaps (%rdi), %ymm0
+; KNL-OLD-NEXT: vmovaps 32(%rdi), %ymm1
+; KNL-OLD-NEXT: callq _test15_callee
+; KNL-OLD-NEXT: vmovaps %ymm1, 32(%rbx)
+; KNL-OLD-NEXT: vmovaps %ymm0, (%rbx)
+; KNL-OLD-NEXT: popq %rbx
+; KNL-OLD-NEXT: retq
+;
+; SKX-LABEL: test15:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbx
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbx, -16
+; SKX-NEXT: movq %rdi, %rbx
+; SKX-NEXT: vmovaps (%rdi), %zmm0
+; SKX-NEXT: callq _test15_callee
+; SKX-NEXT: vmovaps %zmm0, (%rbx)
+; SKX-NEXT: popq %rbx
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; KNL_X32-LABEL: test15:
+; KNL_X32: ## %bb.0:
+; KNL_X32-NEXT: pushl %esi
+; KNL_X32-NEXT: .cfi_def_cfa_offset 8
+; KNL_X32-NEXT: subl $8, %esp
+; KNL_X32-NEXT: .cfi_def_cfa_offset 16
+; KNL_X32-NEXT: .cfi_offset %esi, -8
+; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; KNL_X32-NEXT: vmovaps (%esi), %zmm0
+; KNL_X32-NEXT: calll _test15_callee
+; KNL_X32-NEXT: vmovaps %zmm0, (%esi)
+; KNL_X32-NEXT: addl $8, %esp
+; KNL_X32-NEXT: popl %esi
+; KNL_X32-NEXT: retl
+ %a = load <64 x i8>, <64 x i8>* %x
+ %b = call <64 x i8> @test15_callee(<64 x i8> %a)
+ store <64 x i8> %b, <64 x i8>* %x
+ ret void
+}
+declare <64 x i8> @test15_callee(<64 x i8>)
define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8mem_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8mem_to_32x16:
;
; AVX512DQNOBW-LABEL: zext_32x8mem_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = zext <32 x i8> %a to <32 x i16>
define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8mem_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm2
-; KNL-NEXT: vpmovsxbw (%rdi), %ymm3
+; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
+; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm3
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8mem_to_32x16:
;
; AVX512DQNOBW-LABEL: sext_32x8mem_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm2
-; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm3
+; AVX512DQNOBW-NEXT: vpmovsxbw (%rdi), %ymm2
+; AVX512DQNOBW-NEXT: vpmovsxbw 16(%rdi), %ymm3
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%a = load <32 x i8>,<32 x i8> *%i,align 1
%x = sext <32 x i8> %a to <32 x i16>
define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16:
;
; AVX512DQNOBW-LABEL: zext_32x8_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQNOBW-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16_mask:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpsllw $15, %ymm2, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16_mask:
;
; AVX512DQNOBW-LABEL: zext_32x8_to_32x16_mask:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16:
; KNL: # %bb.0:
-; KNL-NEXT: vpmovsxbw %xmm0, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
-; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbw %xmm1, %ymm1
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16:
;
; AVX512DQNOBW-LABEL: sext_32x8_to_32x16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX512DQNOBW-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQNOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
ret <32 x i16> %x
define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16_mask:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT: vpmovsxbw %xmm3, %ymm3
+; KNL-NEXT: vpmovsxbw %xmm0, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpsllw $15, %ymm2, %ymm1
; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16_mask:
;
; AVX512DQNOBW-LABEL: sext_32x8_to_32x16_mask:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQNOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQNOBW-NEXT: vpmovsxbw %xmm3, %ymm3
+; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX512DQNOBW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQNOBW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpsllw $15, %ymm2, %ymm1
; AVX512DQNOBW-NEXT: vpsraw $15, %ymm1, %ymm1
; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQNOBW-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
; KNL-LABEL: zext_64xi1_to_64xi8:
; KNL: # %bb.0:
-; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; KNL-NEXT: vpand %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_64xi1_to_64xi8:
;
; AVX512DQNOBW-LABEL: zext_64xi1_to_64xi8:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQNOBW-NEXT: vpcmpeqb %ymm2, %ymm3, %ymm2
+; AVX512DQNOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <64 x i8> %x, %y
%1 = zext <64 x i1> %mask to <64 x i8>
define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-LABEL: zext_32xi1_to_32xi16:
; KNL: # %bb.0:
-; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vpsrlw $15, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi16:
;
; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi16:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm2, %ymm2
+; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm0, %ymm0
-; AVX512DQNOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpsrlw $15, %ymm1, %ymm1
+; AVX512DQNOBW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQNOBW-NEXT: retq
%mask = icmp eq <32 x i16> %x, %y
%1 = zext <32 x i1> %mask to <32 x i16>
define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-LABEL: zext_32xi1_to_32xi8:
; KNL: # %bb.0:
-; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
;
; AVX512DQNOBW-LABEL: zext_32xi1_to_32xi8:
; AVX512DQNOBW: # %bb.0:
-; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQNOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQNOBW-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; AVX512DQNOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQNOBW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQNOBW-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQNOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512DQNOBW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQNOBW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQNOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-LABEL: insert_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2
-; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpinsrw $1, %edi, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v32i16:
define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-LABEL: insert_v64i8:
; KNL: ## %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpinsrb $2, %edi, %xmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v64i8:
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
-; KNL-NEXT: vpminub %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
-; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
-; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movb (%rsp,%rdi), %al
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: addb %dil, %dil
-; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: movzbl %dil, %eax
; KNL-NEXT: andl $63, %eax
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: and_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: and_v64i8:
define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: andn_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v64i8:
define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: or_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vorps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: or_v64i8:
define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: xor_v64i8:
; KNL: ## %bb.0:
-; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v64i8:
define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: and_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: and_v32i16:
define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: andn_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; KNL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; KNL-NEXT: vandnps %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v32i16:
define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: or_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vorps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpord %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: or_v32i16:
define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: xor_v32i16:
; KNL: ## %bb.0:
-; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v32i16:
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test16:
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test17:
define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: test21:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
+; KNL-NEXT: vpsraw $15, %ymm1, %ymm1
+; KNL-NEXT: vpand %ymm3, %ymm1, %ymm1
; KNL-NEXT: vpsllw $15, %ymm2, %ymm2
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm0, %ymm2, %ymm0
-; KNL-NEXT: vpsllw $15, %ymm3, %ymm2
-; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
-; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test21:
;
; AVX512DQ-LABEL: test21:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT: vpsllw $15, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsraw $15, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vpsllw $15, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test21:
define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
; KNL-LABEL: test_build_vec_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1:
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1:
define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize {
; KNL-LABEL: test_build_vec_v32i1_optsize:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1_optsize:
;
; AVX512DQ-LABEL: test_build_vec_v32i1_optsize:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v32i1_optsize:
define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
; KNL-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v64i1:
;
; AVX512DQ-LABEL: test_build_vec_v64i1:
; AVX512DQ: ## %bb.0:
-; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: test_build_vec_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdw %zmm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: load_32i1:
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_32i1:
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: kmovw 4(%rdi), %k3
; KNL-NEXT: kmovw 6(%rdi), %k4
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k4} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z}
+; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; KNL-NEXT: vpmovdb %zmm2, %xmm2
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: load_64i1:
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: kmovw 4(%rdi), %k2
; AVX512DQ-NEXT: kmovw 6(%rdi), %k3
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT: vpmovm2d %k3, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vpmovm2d %k3, %zmm2
+; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2
; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; X86-LABEL: load_64i1:
define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-LABEL: store_32i1_1:
; KNL: ## %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
;
; AVX512DQ-LABEL: store_32i1_1:
; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7
; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8
; KNL-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm7
; KNL-NEXT: vpcmpeqw %ymm8, %ymm1, %ymm1
+; KNL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm1
+; KNL-NEXT: vpor %ymm1, %ymm7, %ymm1
; KNL-NEXT: vpcmpeqw %ymm8, %ymm2, %ymm2
-; KNL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm2
-; KNL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2
-; KNL-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm3
-; KNL-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm4
-; KNL-NEXT: vpor %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm5
+; KNL-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm3
+; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm2
-; KNL-NEXT: vpor %ymm2, %ymm3, %ymm2
+; KNL-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2
+; KNL-NEXT: vpor %ymm2, %ymm5, %ymm2
; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7
; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8
; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm7
; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm1
+; AVX512DQ-NEXT: vpor %ymm1, %ymm7, %ymm1
; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2
-; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm3
-; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm6, %ymm4
-; AVX512DQ-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm7, %ymm2
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpcmpeqw %ymm8, %ymm4, %ymm2
+; AVX512DQ-NEXT: vpor %ymm2, %ymm5, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: vextracti64x4 $1, %zmm3, %ymm9
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm10
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm11
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm7
; KNL-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9
-; KNL-NEXT: vextracti128 $1, %ymm9, %xmm0
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm10
-; KNL-NEXT: vextracti128 $1, %ymm10, %xmm1
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm11
-; KNL-NEXT: vextracti128 $1, %ymm11, %xmm2
-; KNL-NEXT: vpor %xmm2, %xmm0, %xmm13
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
-; KNL-NEXT: vpor %xmm3, %xmm1, %xmm12
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm4, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm5, %ymm5
-; KNL-NEXT: vextracti128 $1, %ymm5, %xmm1
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm6, %ymm6
-; KNL-NEXT: vextracti128 $1, %ymm6, %xmm0
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm13
+; KNL-NEXT: vextracti128 $1, %ymm13, %xmm4
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm7
+; KNL-NEXT: vextracti128 $1, %ymm7, %xmm5
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6
+; KNL-NEXT: vpor %xmm6, %xmm4, %xmm12
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6
+; KNL-NEXT: vextracti128 $1, %ymm6, %xmm4
+; KNL-NEXT: vpor %xmm4, %xmm5, %xmm11
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm5
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10
+; KNL-NEXT: vextracti128 $1, %ymm10, %xmm4
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
+; KNL-NEXT: vpor %xmm0, %xmm5, %xmm0
+; KNL-NEXT: vpand %xmm0, %xmm12, %xmm12
+; KNL-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5
+; KNL-NEXT: vextracti128 $1, %ymm5, %xmm0
; KNL-NEXT: vpor %xmm0, %xmm4, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm13, %xmm0
-; KNL-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm4
-; KNL-NEXT: vextracti128 $1, %ymm4, %xmm7
-; KNL-NEXT: vpor %xmm7, %xmm1, %xmm1
-; KNL-NEXT: vpand %xmm1, %xmm12, %xmm1
-; KNL-NEXT: vpor %xmm2, %xmm10, %xmm2
-; KNL-NEXT: vpor %xmm11, %xmm9, %xmm7
-; KNL-NEXT: vpor %xmm4, %xmm5, %xmm4
-; KNL-NEXT: vpand %xmm4, %xmm2, %xmm2
-; KNL-NEXT: vpor %xmm6, %xmm3, %xmm3
-; KNL-NEXT: vpand %xmm3, %xmm7, %xmm3
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
+; KNL-NEXT: vpand %xmm0, %xmm11, %xmm0
+; KNL-NEXT: vpor %xmm6, %xmm7, %xmm4
+; KNL-NEXT: vpor %xmm1, %xmm13, %xmm1
+; KNL-NEXT: vpor %xmm5, %xmm10, %xmm5
+; KNL-NEXT: vpand %xmm5, %xmm4, %xmm4
+; KNL-NEXT: vpor %xmm3, %xmm2, %xmm2
+; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpmovsxbd %xmm12, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: shll $16, %edx
; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rax
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm9
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm10
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm11
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm7
; AVX512DQ-NEXT: vpxor %xmm8, %xmm8, %xmm8
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9
-; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm0
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm10
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm1
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm11
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm2
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm13
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm2
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm12
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm4, %ymm3
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm5, %ymm5
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm6, %ymm6
-; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm0
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm13
+; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm4
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm7
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm5
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512DQ-NEXT: vpor %xmm6, %xmm4, %xmm12
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm11, %ymm6
+; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm4
+; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm11
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm10, %ymm10
+; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm4
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm0
+; AVX512DQ-NEXT: vpor %xmm0, %xmm5, %xmm0
+; AVX512DQ-NEXT: vpand %xmm0, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm9, %ymm5
+; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm0
; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX512DQ-NEXT: vpand %xmm0, %xmm13, %xmm0
-; AVX512DQ-NEXT: vpcmpeqb %ymm8, %ymm7, %ymm4
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpand %xmm1, %xmm12, %xmm1
-; AVX512DQ-NEXT: vpor %xmm2, %xmm10, %xmm2
-; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm7
-; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512DQ-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpand %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT: vpmovd2m %zmm3, %k0
+; AVX512DQ-NEXT: vpand %xmm0, %xmm11, %xmm0
+; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm4
+; AVX512DQ-NEXT: vpor %xmm1, %xmm13, %xmm1
+; AVX512DQ-NEXT: vpor %xmm5, %xmm10, %xmm5
+; AVX512DQ-NEXT: vpand %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovsxbd %xmm12, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, %ecx
; AVX512DQ-NEXT: shll $16, %ecx
; AVX512DQ-NEXT: orl %eax, %ecx
-; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
+; AVX512DQ-NEXT: vpmovsxbd %xmm4, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %edx
; AVX512DQ-NEXT: shll $16, %edx
define <32 x i16> @pr42355_v32i16(i1 %c, <32 x i16> %x, <32 x i16> %y) {
; X86-AVX512F-LABEL: pr42355_v32i16:
; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: pushl %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX512F-NEXT: .cfi_offset %ebp, -8
-; X86-AVX512F-NEXT: movl %esp, %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa_register %ebp
-; X86-AVX512F-NEXT: andl $-32, %esp
-; X86-AVX512F-NEXT: subl $32, %esp
-; X86-AVX512F-NEXT: testb $1, 8(%ebp)
-; X86-AVX512F-NEXT: jne .LBB14_2
-; X86-AVX512F-NEXT: # %bb.1:
-; X86-AVX512F-NEXT: vmovaps 40(%ebp), %ymm1
-; X86-AVX512F-NEXT: vmovaps %ymm2, %ymm0
-; X86-AVX512F-NEXT: .LBB14_2:
-; X86-AVX512F-NEXT: movl %ebp, %esp
-; X86-AVX512F-NEXT: popl %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-AVX512F-NEXT: jne .LBB14_1
+; X86-AVX512F-NEXT: # %bb.2:
+; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X86-AVX512F-NEXT: retl
+; X86-AVX512F-NEXT: .LBB14_1:
+; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; X86-AVX512F-NEXT: retl
;
; X64-AVX512F-LABEL: pr42355_v32i16:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: testb $1, %dil
-; X64-AVX512F-NEXT: jne .LBB14_2
-; X64-AVX512F-NEXT: # %bb.1:
-; X64-AVX512F-NEXT: vmovaps %ymm2, %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm3, %ymm1
-; X64-AVX512F-NEXT: .LBB14_2:
+; X64-AVX512F-NEXT: jne .LBB14_1
+; X64-AVX512F-NEXT: # %bb.2:
+; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X64-AVX512F-NEXT: retq
+; X64-AVX512F-NEXT: .LBB14_1:
+; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X86-AVX512BW-LABEL: pr42355_v32i16:
define <64 x i8> @pr42355_v64i8(i1 %c, <64 x i8> %x, <64 x i8> %y) {
; X86-AVX512F-LABEL: pr42355_v64i8:
; X86-AVX512F: # %bb.0:
-; X86-AVX512F-NEXT: pushl %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa_offset 8
-; X86-AVX512F-NEXT: .cfi_offset %ebp, -8
-; X86-AVX512F-NEXT: movl %esp, %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa_register %ebp
-; X86-AVX512F-NEXT: andl $-32, %esp
-; X86-AVX512F-NEXT: subl $32, %esp
-; X86-AVX512F-NEXT: testb $1, 8(%ebp)
-; X86-AVX512F-NEXT: jne .LBB15_2
-; X86-AVX512F-NEXT: # %bb.1:
-; X86-AVX512F-NEXT: vmovaps 40(%ebp), %ymm1
-; X86-AVX512F-NEXT: vmovaps %ymm2, %ymm0
-; X86-AVX512F-NEXT: .LBB15_2:
-; X86-AVX512F-NEXT: movl %ebp, %esp
-; X86-AVX512F-NEXT: popl %ebp
-; X86-AVX512F-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX512F-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-AVX512F-NEXT: jne .LBB15_1
+; X86-AVX512F-NEXT: # %bb.2:
+; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; X86-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X86-AVX512F-NEXT: retl
+; X86-AVX512F-NEXT: .LBB15_1:
+; X86-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; X86-AVX512F-NEXT: retl
;
; X64-AVX512F-LABEL: pr42355_v64i8:
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: testb $1, %dil
-; X64-AVX512F-NEXT: jne .LBB15_2
-; X64-AVX512F-NEXT: # %bb.1:
-; X64-AVX512F-NEXT: vmovaps %ymm2, %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm3, %ymm1
-; X64-AVX512F-NEXT: .LBB15_2:
+; X64-AVX512F-NEXT: jne .LBB15_1
+; X64-AVX512F-NEXT: # %bb.2:
+; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; X64-AVX512F-NEXT: vmovaps %ymm1, %ymm0
+; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X64-AVX512F-NEXT: retq
+; X64-AVX512F-NEXT: .LBB15_1:
+; X64-AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X86-AVX512BW-LABEL: pr42355_v64i8:
define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
; KNL-LABEL: trunc_wb_512:
; KNL: ## %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_512_mem:
; KNL: ## %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; KNL-NEXT: vpmovdb %zmm1, 16(%rdi)
; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-LABEL: _invec32xi8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec32xi8:
; AVX512F-LABEL: _invec16xi16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec16xi16:
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16:
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8:
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
-; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1
-; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <16 x i16>, <16 x i16> *%p
%2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
;
; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1
-; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
-; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
+; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm0
+; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm1
+; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X64-AVX512DQVL-NEXT: retq
%1 = load <32 x i8>, <32 x i8> *%p
%2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm2
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
+; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm0
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
;
; AVX512F-LABEL: v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpcmpgtb %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512F-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512F-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX512F-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm6, %ymm5
+; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm7
+; AVX512F-NEXT: vpand %xmm7, %xmm6, %xmm6
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpmovsxbd %xmm2, %zmm0
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
; AVX512F-NEXT: shll $16, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpand %xmm5, %xmm1, %xmm0
+; AVX512F-NEXT: vpand %xmm4, %xmm5, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %edx
-; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm0
+; AVX512F-NEXT: vpmovsxbd %xmm6, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: shrl $16, %edi
; AVX512F-NEXT: kmovw %edi, %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i32_32i16:
; AVX512F-LABEL: ext_i64_64i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
+; AVX512F-NEXT: movl %edi, %ecx
; AVX512F-NEXT: kmovw %edi, %k1
-; AVX512F-NEXT: movl %edi, %edx
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
+; AVX512F-NEXT: shrq $32, %rdi
+; AVX512F-NEXT: shrq $48, %rax
+; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: kmovw %ecx, %k2
; AVX512F-NEXT: kmovw %eax, %k3
-; AVX512F-NEXT: kmovw %edx, %k4
+; AVX512F-NEXT: kmovw %edi, %k4
; AVX512F-NEXT: movl {{.*}}(%rip), %eax
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k4} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k4} {z}
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k3} {z}
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: ext_i64_64i8:
;
; AVX512F-LABEL: v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %ecx
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm0
-; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: shll $16, %eax
;
; AVX512F-LABEL: v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovmskb %ymm0, %ecx
-; AVX512F-NEXT: vpmovmskb %ymm1, %eax
+; AVX512F-NEXT: vpmovmskb %ymm2, %eax
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rcx, %rax
; AVX512F-NEXT: vzeroupper
;
; AVX512F-LABEL: bitcast_64i8_store:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm3
;
; AVX512F-LABEL: bitcast_32i16_store:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f64xi8_i16:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f64xi8_i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i16:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f64xi8_i16:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f64xi8_i16:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i16:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f64i8_i32:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f64i8_i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64i8_i32:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f64i8_i32:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f64i8_i32:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64i8_i32:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f64xi8_i64:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f64xi8_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i64:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f64xi8_i64:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f64xi8_i64:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i64:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f64xi8_i128:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f64xi8_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i128:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f64xi8_i128:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f64xi8_i128:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i128:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f64xi8_i256:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f64xi8_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i256:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f64xi8_i256:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f64xi8_i256:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i256:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f32xi16_i32:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f32xi16_i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i32:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f32xi16_i32:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f32xi16_i32:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i32:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f32xi16_i64:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f32xi16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i64:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f32xi16_i64:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f32xi16_i64:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i64:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f32xi16_i128:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f32xi16_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i128:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f32xi16_i128:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f32xi16_i128:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i128:
; AVX512BW-64: # %bb.0:
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
-; NO-AVX512BW-LABEL: f32xi16_i256:
-; NO-AVX512BW: # %bb.0:
-; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-NEXT: retl
+; AVX2-LABEL: f32xi16_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i256:
; AVX512BW: # %bb.0:
; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: retq
;
-; NO-AVX512BW-64-LABEL: f32xi16_i256:
-; NO-AVX512BW-64: # %bb.0:
-; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm0, %ymm0
-; NO-AVX512BW-64-NEXT: vpand %ymm2, %ymm1, %ymm1
-; NO-AVX512BW-64-NEXT: retq
+; AVX2-64-LABEL: f32xi16_i256:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i256:
; AVX512BW-64: # %bb.0:
}
define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15, i16 %a16, i16 %a17, i16 %a18, i16 %a19, i16 %a20, i16 %a21, i16 %a22, i16 %a23, i16 %a24, i16 %a25, i16 %a26, i16 %a27, i16 %a28, i16 %a29, i16 %a30, i16 %a31) {
-; AVX512F-32-LABEL: test_buildvector_v32i16:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-32-NEXT: retl
-;
-; AVX512F-64-LABEL: test_buildvector_v32i16:
-; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-64-NEXT: vmovd %edi, %xmm0
-; AVX512F-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-64-NEXT: retq
-;
-; AVX512BW-32-LABEL: test_buildvector_v32i16:
-; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-32-NEXT: retl
+; AVX-32-LABEL: test_buildvector_v32i16:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-32-NEXT: retl
;
-; AVX512BW-64-LABEL: test_buildvector_v32i16:
-; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-64-NEXT: vmovd %edi, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-64-NEXT: retq
+; AVX-64-LABEL: test_buildvector_v32i16:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vmovd %edi, %xmm1
+; AVX-64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-64-NEXT: retq
%ins0 = insertelement <32 x i16> undef, i16 %a0, i32 0
%ins1 = insertelement <32 x i16> %ins0, i16 %a1, i32 1
%ins2 = insertelement <32 x i16> %ins1, i16 %a2, i32 2
}
define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31, i8 %a32, i8 %a33, i8 %a34, i8 %a35, i8 %a36, i8 %a37, i8 %a38, i8 %a39, i8 %a40, i8 %a41, i8 %a42, i8 %a43, i8 %a44, i8 %a45, i8 %a46, i8 %a47, i8 %a48, i8 %a49, i8 %a50, i8 %a51, i8 %a52, i8 %a53, i8 %a54, i8 %a55, i8 %a56, i8 %a57, i8 %a58, i8 %a59, i8 %a60, i8 %a61, i8 %a62, i8 %a63) {
-; AVX512F-32-LABEL: test_buildvector_v64i8:
-; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512F-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-32-NEXT: retl
-;
-; AVX512F-64-LABEL: test_buildvector_v64i8:
-; AVX512F-64: # %bb.0:
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512F-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-64-NEXT: vmovd %edi, %xmm0
-; AVX512F-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $4, %r8d, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $5, %r9d, %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512F-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512F-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-64-NEXT: retq
-;
-; AVX512BW-32-LABEL: test_buildvector_v64i8:
-; AVX512BW-32: # %bb.0:
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2
-; AVX512BW-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-32-NEXT: retl
+; AVX-32-LABEL: test_buildvector_v64i8:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; AVX-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $5, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $6, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $7, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $9, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $10, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $11, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $12, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $13, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $14, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vpinsrb $15, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; AVX-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-32-NEXT: retl
;
-; AVX512BW-64-LABEL: test_buildvector_v64i8:
-; AVX512BW-64: # %bb.0:
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-64-NEXT: vmovd %edi, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
-; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
-; AVX512BW-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512BW-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-64-NEXT: retq
+; AVX-64-LABEL: test_buildvector_v64i8:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX-64-NEXT: vmovd %edi, %xmm1
+; AVX-64-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm1, %xmm1
+; AVX-64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $4, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $8, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $9, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $10, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $11, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $12, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $13, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $14, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vpinsrb $15, {{[0-9]+}}(%rsp), %xmm2, %xmm2
+; AVX-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX-64-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX-64-NEXT: retq
%ins0 = insertelement <64 x i8> undef, i8 %a0, i32 0
%ins1 = insertelement <64 x i8> %ins0, i8 %a1, i32 1
%ins2 = insertelement <64 x i8> %ins1, i8 %a2, i32 2
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
; AVX512F-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpsraw $15, %ymm1, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm2
; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpsravd %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
;
; AVX512VL-LABEL: test_nt64xi8:
; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512VL-NEXT: vzeroupper
;
; AVX512F-LABEL: test_nt64xi8:
; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512F-NEXT: vzeroupper
;
; AVX512VL-LABEL: test_nt32xi16:
; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512VL-NEXT: vzeroupper
;
; AVX512F-LABEL: test_nt32xi16:
; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: test_load_nt64xi8:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
-;
-; AVX512F-LABEL: test_load_nt64xi8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_load_nt64xi8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_load_nt64xi8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <64 x i8>, <64 x i8>* %ptr, align 64, !nontemporal !1
ret <64 x i8> %0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: test_load_nt32xi16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
-;
-; AVX512F-LABEL: test_load_nt32xi16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_load_nt32xi16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_load_nt32xi16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <32 x i16>, <32 x i16>* %ptr, align 64, !nontemporal !1
ret <32 x i16> %0
; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
-; KNL-LABEL: test_v64i8:
-; KNL: # %bb.0: # %entry
-; KNL-NEXT: vmovaps (%rdi), %ymm0
-; KNL-NEXT: vmovaps 32(%rdi), %ymm1
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_v64i8:
-; SKX: # %bb.0: # %entry
-; SKX-NEXT: vmovdqa64 (%rdi), %zmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: test_v64i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <64 x i8>, <64 x i8>* %V, align 64
ret <64 x i8> %0
; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
-; KNL-LABEL: test_v32i16:
-; KNL: # %bb.0: # %entry
-; KNL-NEXT: vmovaps (%rdi), %ymm0
-; KNL-NEXT: vmovaps 32(%rdi), %ymm1
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_v32i16:
-; SKX: # %bb.0: # %entry
-; SKX-NEXT: vmovdqa64 (%rdi), %zmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: test_v32i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <32 x i16>, <32 x i16>* %V, align 64
ret <32 x i16> %0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
-; KNL-LABEL: test_v64i8_unaligned:
-; KNL: # %bb.0: # %entry
-; KNL-NEXT: vmovups (%rdi), %ymm0
-; KNL-NEXT: vmovups 32(%rdi), %ymm1
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_v64i8_unaligned:
-; SKX: # %bb.0: # %entry
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: test_v64i8_unaligned:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <64 x i8>, <64 x i8>* %V, align 4
ret <64 x i8> %0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
-; KNL-LABEL: test_v32i16_unaligned:
-; KNL: # %bb.0: # %entry
-; KNL-NEXT: vmovups (%rdi), %ymm0
-; KNL-NEXT: vmovups 32(%rdi), %ymm1
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_v32i16_unaligned:
-; SKX: # %bb.0: # %entry
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
-; SKX-NEXT: retq
+; AVX512-LABEL: test_v32i16_unaligned:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: retq
entry:
%0 = load <32 x i16>, <32 x i16>* %V, align 4
ret <32 x i16> %0
define i32 @kshiftl_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftl_v32i1_1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpmovsxwd %ymm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: kshiftlw $1, %k2, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
define i64 @kshiftl_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftl_v64i1_1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm0[15],zmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; KNL-NEXT: kshiftlw $1, %k1, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm6
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6
; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
-; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
define i32 @kshiftl_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftl_v32i1_31:
; KNL: # %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftl_v64i1_63:
; KNL: # %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm0
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
define i32 @kshiftr_v32i1_1(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftr_v32i1_1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpmovsxwd %ymm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k1
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0]
; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
define i64 @kshiftr_v64i1_1(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm5
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm5
; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k3
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
+; KNL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm3[0]
; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
+; KNL-NEXT: valignd {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
; KNL-NEXT: kshiftrw $1, %k1, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm6
; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm3
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k4
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: shlq $32, %rcx
define i32 @kshiftr_v32i1_31(<32 x i16> %x, <32 x i16> %y) {
; KNL-LABEL: kshiftr_v32i1_31:
; KNL: # %bb.0:
-; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k1
-; KNL-NEXT: vpcmpeqw %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_63:
; KNL: # %bb.0:
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k1
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm0
+; KNL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
;
; AVX512F-LABEL: pmaddwd_32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddwd_32:
;
; AVX512F-LABEL: jumbled_indices16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: jumbled_indices16:
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovmskb %ymm2, %eax
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovmskb %ymm1, %eax
; AVX512F-NEXT: notl %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB15_1
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vpmovmskb %ymm2, %eax
+; AVX512F-NEXT: vpmovmskb %ymm1, %eax
; AVX512F-NEXT: notl %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB15_1
;
; AVX512F-LABEL: truncstore_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovmskb %ymm2, %eax
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovmskb %ymm1, %eax
; AVX512F-NEXT: notl %eax
; AVX512F-NEXT: testb $1, %al
; AVX512F-NEXT: jne .LBB15_1
; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovaps %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
; AVX512F-LABEL: vec512_i16_signed_reg_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm6
+; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm7
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_reg:
define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nounwind {
; AVX512F-LABEL: vec512_i16_unsigned_reg_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpminuw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpminuw %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm1, %ymm8
+; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm7
+; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8
; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8
; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmaxuw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsubw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm0, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminuw %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm1, %ymm8
+; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8
; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_unsigned_reg_reg:
define <32 x i16> @vec512_i16_signed_mem_reg(<32 x i16>* %a1_addr, <32 x i16> %a2) nounwind {
; AVX512F-LABEL: vec512_i16_signed_mem_reg:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm6
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm6
-; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsubw %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm6
+; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm7
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubw %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpsubw %ymm7, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, <32 x i16>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_reg_mem:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_reg_mem:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsw %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubw %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm7
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_reg:
define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminub %ymm2, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpminub %ymm3, %ymm2, %ymm4
+; AVX512F-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
; AVX512F-NEXT: vpternlogq $15, %zmm5, %zmm5, %zmm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512F-NEXT: vpminub %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm1, %ymm8
+; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm7
+; AVX512F-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8
; AVX512F-NEXT: vpternlogq $15, %zmm8, %zmm8, %zmm8
; AVX512F-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512F-NEXT: vpmaxub %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpmaxub %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm2, %ymm0, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm1, %ymm8
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8
; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_unsigned_reg_reg:
define <64 x i8> @vec512_i8_signed_mem_reg(<64 x i8>* %a1_addr, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm6
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm7
-; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm7
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm4
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, <64 x i8>* %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_reg_mem:
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa (%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
; AVX512F-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512F-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512F-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm2
; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm1, %ymm6
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm0, %ymm7
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm1, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm6, %ymm6
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm6, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem:
;
; KNL-LABEL: allones_v64i8_sign:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: vpmovmskb %ymm0, %ecx
;
; KNL-LABEL: allzeros_v64i8_sign:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
; KNL-NEXT: shlq $32, %rax
; KNL-NEXT: vpmovmskb %ymm0, %ecx
;
; KNL-LABEL: allones_v32i16_sign:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
;
; KNL-LABEL: allzeros_v32i16_sign:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
;
; KNL-LABEL: allones_v64i8_and1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm1, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
;
; KNL-LABEL: allzeros_v64i8_and1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm1, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
;
; KNL-LABEL: allones_v32i16_and1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
;
; KNL-LABEL: allzeros_v32i16_and1:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
;
; KNL-LABEL: allones_v64i8_and4:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
; KNL-NEXT: vpsllw $5, %ymm1, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
;
; KNL-LABEL: allzeros_v64i8_and4:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $5, %ymm0, %ymm0
; KNL-NEXT: vpsllw $5, %ymm1, %ymm1
; KNL-NEXT: vpmovmskb %ymm1, %eax
;
; KNL-LABEL: allones_v32i16_and4:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
;
; KNL-LABEL: allzeros_v32i16_and4:
; KNL: # %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsllw $13, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v32i16_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovups (%rdi), %zmm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v32i16_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
ret <32 x i16> %1
}
; AVX2-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: test_v64i8_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovups (%rdi), %zmm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_v64i8_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
ret <64 x i8> %1
}
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
-; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: retq
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[0-9]+}}(%rsp)
; AVX512DQ-NEXT: vmovaps (%rsp), %ymm0
-; AVX512DQ-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512DQ-NEXT: vinsertf64x4 $1, {{[0-9]+}}(%rsp), %zmm0, %zmm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: retq
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16_align32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8_align32:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: test_v32i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v32i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
ret <32 x i16> %1
}
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: test_v64i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v64i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v64i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
ret <64 x i8> %1
}
;
; AVX512F-LABEL: test_arg_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_arg_v32i16:
;
; AVX512VL-LABEL: test_arg_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm3
-; AVX512VL-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3
+; AVX512VL-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
%2 = add <32 x i16> %arg, %1
;
; AVX512F-LABEL: test_arg_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovntdqa (%rdi), %ymm2
+; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_arg_v64i8:
;
; AVX512VL-LABEL: test_arg_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2
-; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm3
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm2
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
%2 = add <64 x i8> %arg, %1
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
-; AVX512F-LABEL: test_unaligned_v32i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_unaligned_v32i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovups (%rdi), %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_unaligned_v32i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_unaligned_v32i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %src, align 1, !nontemporal !1
ret <32 x i16> %1
}
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
-; AVX512F-LABEL: test_unaligned_v64i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_unaligned_v64i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovups (%rdi), %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_unaligned_v64i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_unaligned_v64i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovups (%rdi), %zmm0
+; AVX512-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %src, align 1, !nontemporal !1
ret <64 x i8> %1
}
;
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8c:
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
;
; AVX512F-LABEL: mulhuw_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhuw_v32i16:
;
; AVX512F-LABEL: mulhw_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mulhw_v32i16:
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_8i16_32i16:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <32 x i16> %2
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_16i16_32i16:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <16 x i16>, <16 x i16> *%p
%2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x i16> %2
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_16i8_64i8:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <64 x i8> %2
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_32i8_64i8:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <32 x i8>, <32 x i8> *%p
%2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <64 x i8> %2
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: reg_broadcast_8i16_32i16:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <32 x i16> %1
}
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: reg_broadcast_16i16_32i16:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <32 x i16> %1
}
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: reg_broadcast_16i8_64i8:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <64 x i8> %1
}
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
-; X32-AVX512F: # %bb.0:
-; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
-; X32-AVX512BW: # %bb.0:
-; X32-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
-; X32-AVX512DQ: # %bb.0:
-; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: reg_broadcast_32i8_64i8:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
-; X64-AVX512F: # %bb.0:
-; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
-; X64-AVX512DQ: # %bb.0:
-; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <64 x i8> %1
}
; NOBW-NEXT: movq %rsp, %rbp
; NOBW-NEXT: andq $-64, %rsp
; NOBW-NEXT: subq $2112, %rsp # imm = 0x840
+; NOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
; NOBW-NEXT: vmovd %xmm4, %eax
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, (%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm0, (%rsp)
-; NOBW-NEXT: movzwl 1472(%rsp,%rax,2), %eax
+; NOBW-NEXT: movzwl 1536(%rsp,%rax,2), %eax
; NOBW-NEXT: vmovd %eax, %xmm0
; NOBW-NEXT: vpextrw $1, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $1, 1408(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $1, 1600(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $2, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $2, 1344(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $2, 1664(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $3, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $3, 1280(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $3, 1728(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $4, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $4, 1216(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $4, 1792(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $5, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $5, 1152(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $5, 1856(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $6, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $6, 1088(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $6, 1920(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vpextrw $7, %xmm4, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $7, 1024(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrw $7, 1984(%rsp,%rax,2), %xmm0, %xmm0
; NOBW-NEXT: vmovd %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: movzwl 1984(%rsp,%rax,2), %eax
-; NOBW-NEXT: vmovd %eax, %xmm1
-; NOBW-NEXT: vpextrw $1, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $1, 1920(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $2, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $2, 1856(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $3, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $3, 1792(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $4, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $4, 1728(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $5, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $5, 1664(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $6, %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $6, 1600(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vpextrw $7, %xmm2, %eax
-; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $7, 1536(%rsp,%rax,2), %xmm1, %xmm1
-; NOBW-NEXT: vmovd %xmm2, %eax
-; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: movzwl 448(%rsp,%rax,2), %eax
+; NOBW-NEXT: movzwl 1024(%rsp,%rax,2), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
; NOBW-NEXT: vpextrw $1, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $1, 384(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $1, 1088(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $2, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $2, 320(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $2, 1152(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $3, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $3, 256(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $3, 1216(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $4, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $4, 192(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $4, 1280(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $5, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $5, 128(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $5, 1344(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $6, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $6, 64(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $6, 1408(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $7, %xmm2, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm2
+; NOBW-NEXT: vpinsrw $7, 1472(%rsp,%rax,2), %xmm4, %xmm2
; NOBW-NEXT: vmovd %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: movzwl 960(%rsp,%rax,2), %eax
+; NOBW-NEXT: movzwl 512(%rsp,%rax,2), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
; NOBW-NEXT: vpextrw $1, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $1, 896(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $1, 576(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $2, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $2, 832(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $2, 640(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $3, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $3, 768(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $3, 704(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $4, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $4, 704(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $4, 768(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $5, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $5, 640(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $5, 832(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $6, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $6, 576(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrw $6, 896(%rsp,%rax,2), %xmm4, %xmm4
; NOBW-NEXT: vpextrw $7, %xmm3, %eax
; NOBW-NEXT: andl $31, %eax
-; NOBW-NEXT: vpinsrw $7, 512(%rsp,%rax,2), %xmm4, %xmm3
-; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
+; NOBW-NEXT: vpinsrw $7, 960(%rsp,%rax,2), %xmm4, %xmm3
+; NOBW-NEXT: vmovd %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: movzwl (%rsp,%rax,2), %eax
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: vpextrw $1, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $1, 64(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $2, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $2, 128(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $3, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $3, 192(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $4, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $4, 256(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $5, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $5, 320(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $6, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $6, 384(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $7, %xmm1, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $7, 448(%rsp,%rax,2), %xmm4, %xmm1
+; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; NOBW-NEXT: movq %rbp, %rsp
; NOBW-NEXT: popq %rbp
; NOBW-NEXT: retq
; NOBW-NEXT: movq %rsp, %rbp
; NOBW-NEXT: andq $-64, %rsp
; NOBW-NEXT: subq $4160, %rsp # imm = 0x1040
+; NOBW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
; NOBW-NEXT: vpextrb $0, %xmm4, %eax
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, (%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
-; NOBW-NEXT: vmovaps %ymm0, (%rsp)
-; NOBW-NEXT: movzbl 3008(%rsp,%rax), %eax
+; NOBW-NEXT: movzbl 3072(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm0
; NOBW-NEXT: vpextrb $1, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $1, 3136(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $2, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $2, 3200(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $3, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $3, 3264(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $4, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $4, 3328(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $5, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $5, 3392(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $6, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $6, 3456(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $7, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $7, 3520(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $8, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $8, 3584(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $9, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $9, 3648(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $10, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $10, 3712(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $11, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $11, 3776(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $12, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $12, 3840(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $13, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $13, 3904(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $14, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpinsrb $14, 3968(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $15, %xmm4, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0
-; NOBW-NEXT: vpextrb $0, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: movzbl 4032(%rsp,%rax), %eax
-; NOBW-NEXT: vmovd %eax, %xmm1
-; NOBW-NEXT: vpextrb $1, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $2, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $3, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $4, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $5, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $6, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $7, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $8, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $9, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $10, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $11, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $12, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $13, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $14, %xmm2, %eax
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1
-; NOBW-NEXT: vpextrb $15, %xmm2, %eax
-; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
-; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1
+; NOBW-NEXT: vpinsrb $15, 4032(%rsp,%rax), %xmm0, %xmm0
; NOBW-NEXT: vpextrb $0, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: movzbl 960(%rsp,%rax), %eax
+; NOBW-NEXT: movzbl 2048(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
; NOBW-NEXT: vpextrb $1, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $1, 2112(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 2176(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 2240(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 2304(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 2368(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 2432(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 2496(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 2560(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 2624(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 2688(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 2752(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 2816(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 2880(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 2944(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm2, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2
+; NOBW-NEXT: vpinsrb $15, 3008(%rsp,%rax), %xmm4, %xmm2
; NOBW-NEXT: vpextrb $0, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: movzbl 1984(%rsp,%rax), %eax
+; NOBW-NEXT: movzbl 1024(%rsp,%rax), %eax
; NOBW-NEXT: vmovd %eax, %xmm4
; NOBW-NEXT: vpextrb $1, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $1, 1088(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $2, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $2, 1152(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $3, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $3, 1216(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $4, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $4, 1280(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $5, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $5, 1344(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $6, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $6, 1408(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $7, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $7, 1472(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $8, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $8, 1536(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $9, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $9, 1600(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $10, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $10, 1664(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $11, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $11, 1728(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $12, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $12, 1792(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $13, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $13, 1856(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $14, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpinsrb $14, 1920(%rsp,%rax), %xmm4, %xmm4
; NOBW-NEXT: vpextrb $15, %xmm3, %eax
; NOBW-NEXT: andl $63, %eax
-; NOBW-NEXT: vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3
-; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
+; NOBW-NEXT: vpinsrb $15, 1984(%rsp,%rax), %xmm4, %xmm3
+; NOBW-NEXT: vpextrb $0, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: movzbl (%rsp,%rax), %eax
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: vpextrb $1, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $1, 64(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $2, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $2, 128(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $3, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $3, 192(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $4, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $4, 256(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $5, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $5, 320(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $6, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $6, 384(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $7, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $7, 448(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $8, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $8, 512(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $9, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $9, 576(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $10, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $10, 640(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $11, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $11, 704(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $12, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $12, 768(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $13, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $13, 832(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $14, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $14, 896(%rsp,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $15, %xmm1, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vpinsrb $15, 960(%rsp,%rax), %xmm4, %xmm1
+; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; NOBW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; NOBW-NEXT: movq %rbp, %rsp
; NOBW-NEXT: popq %rbp
; NOBW-NEXT: retq
; SSE-NEXT: pmullw %xmm4, %xmm3
; SSE-NEXT: retq
;
-; AVX-LABEL: test7:
-; AVX: # %bb.0:
-; AVX-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
-; AVX-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX-NEXT: retq
+; AVX2-LABEL: test7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test7:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
%shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
ret <32 x i16> %shl
}
;
; AVX512F-LABEL: test_bitreverse_v64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
;
; AVX512F-LABEL: test_bitreverse_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
-; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512F-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
;
; AVX512F-LABEL: test_cmp_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vmovdqa %xmm4, %xmm1
; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm1
; AVX512DQ-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm4, %ymm8, %ymm9
+; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2
-; AVX512F-NEXT: vpord %zmm2, %zmm7, %zmm2
-; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm3, %zmm7, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm2
+; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3
-; AVX512F-NEXT: vpord %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
+; AVX512F-NEXT: vpord %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vpsllvd %zmm7, %zmm8, %zmm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm4, %ymm8, %ymm9
+; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm2, %zmm2
-; AVX512VL-NEXT: vpord %zmm2, %zmm7, %zmm2
-; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm9, %zmm3, %zmm3
+; AVX512VL-NEXT: vpord %zmm3, %zmm7, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm2
+; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm3, %zmm3
-; AVX512VL-NEXT: vpord %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT: vpord %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm8
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm9
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm8
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm8, %ymm0, %ymm8
-; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512F-NEXT: vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
+; AVX512F-NEXT: vpsllw $2, %ymm7, %ymm11
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
+; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm8
-; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm7
+; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm11
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm10
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm8, %ymm11, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm10
+; AVX512F-NEXT: vpsrlw $4, %ymm8, %ymm11
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm7, %ymm11, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %ymm9, %ymm12, %ymm13
; AVX512F-NEXT: vpsllw $5, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512F-NEXT: vpand %ymm14, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpsrlw $1, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm10, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm13, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm8
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm8
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm7
-; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm4
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
-; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm5, %ymm12, %ymm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm12, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm4
-; AVX512F-NEXT: vpand %ymm14, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm14, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm4
-; AVX512F-NEXT: vpand %ymm15, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm5, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm6
+; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
; AVX512VL-NEXT: vpsllw $2, %ymm6, %ymm10
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm10
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm9
+; AVX512VL-NEXT: vpsrlw $4, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm12 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm13
+; AVX512VL-NEXT: vpsubb %ymm5, %ymm12, %ymm13
; AVX512VL-NEXT: vpsllw $5, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm14 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm14, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm13, %ymm13, %ymm13
-; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm13, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm7
+; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm4, %ymm12, %ymm7
+; AVX512VL-NEXT: vpsubb %ymm2, %ymm12, %ymm7
; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $2, %ymm3, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $2, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm14, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $1, %ymm3, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $1, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm2
; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm2
; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm7, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm9
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7
; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
-; AVX512F-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
-; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512F-NEXT: vpand %ymm2, %ymm10, %ymm8
-; AVX512F-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
+; AVX512F-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
+; AVX512F-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
+; AVX512F-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
-; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm5
+; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm5
; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm9
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7
; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
-; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
-; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm2, %ymm10, %ymm8
-; AVX512VL-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm9, %ymm9
+; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6
+; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6
+; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
+; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8
; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm5
+; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5
; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm3
-; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm5
+; AVX512F-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm3
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm3
-; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm5
+; AVX512VL-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7
; AVX512F-NEXT: vpackuswb %ymm11, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm5
; AVX512F-NEXT: vpand %ymm8, %ymm5, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm5
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15],ymm3[24],ymm7[24],ymm3[25],ymm7[25],ymm3[26],ymm7[26],ymm3[27],ymm7[27],ymm3[28],ymm7[28],ymm3[29],ymm7[29],ymm3[30],ymm7[30],ymm3[31],ymm7[31]
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15],ymm1[24],ymm7[24],ymm1[25],ymm7[25],ymm1[26],ymm7[26],ymm1[27],ymm7[27],ymm1[28],ymm7[28],ymm1[29],ymm7[29],ymm1[30],ymm7[30],ymm1[31],ymm7[31]
; AVX512F-NEXT: vpmullw %ymm12, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[16],ymm7[16],ymm3[17],ymm7[17],ymm3[18],ymm7[18],ymm3[19],ymm7[19],ymm3[20],ymm7[20],ymm3[21],ymm7[21],ymm3[22],ymm7[22],ymm3[23],ymm7[23]
-; AVX512F-NEXT: vpmullw %ymm13, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[16],ymm7[16],ymm1[17],ymm7[17],ymm1[18],ymm7[18],ymm1[19],ymm7[19],ymm1[20],ymm7[20],ymm1[21],ymm7[21],ymm1[22],ymm7[22],ymm1[23],ymm7[23]
+; AVX512F-NEXT: vpmullw %ymm13, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm8, %ymm7, %ymm7
; AVX512VL-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm5
; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm10, %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpmullw %ymm11, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm12, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmullw %ymm12, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $9, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $7, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $9, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm8
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm9
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpandn %ymm4, %ymm7, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm8
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm8
; AVX512VL-NEXT: vpand %ymm7, %ymm8, %ymm8
; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm9
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9
; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm7, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768>
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512F-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm4, %ymm8, %ymm9
+; AVX512F-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm9, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm7, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
+; AVX512F-NEXT: vpord %zmm7, %zmm3, %zmm3
+; AVX512F-NEXT: vpmovdw %zmm3, %ymm3
; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm2
+; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
; AVX512F-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm7 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm8 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm7, %zmm8, %zmm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm4, %ymm8, %ymm9
+; AVX512VL-NEXT: vpsubw %ymm5, %ymm8, %ymm9
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm9 = ymm9[0],zero,ymm9[1],zero,ymm9[2],zero,ymm9[3],zero,ymm9[4],zero,ymm9[5],zero,ymm9[6],zero,ymm9[7],zero,ymm9[8],zero,ymm9[9],zero,ymm9[10],zero,ymm9[11],zero,ymm9[12],zero,ymm9[13],zero,ymm9[14],zero,ymm9[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm9, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm7, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm9, %zmm3, %zmm3
+; AVX512VL-NEXT: vpord %zmm7, %zmm3, %zmm3
+; AVX512VL-NEXT: vpmovdw %zmm3, %ymm3
; AVX512VL-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm2
+; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpsubw %ymm2, %ymm8, %ymm5
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpcmpeqw %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm8
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm9
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm8
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm9
; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm8, %ymm2, %ymm8
-; AVX512F-NEXT: vpsrlw $2, %ymm8, %ymm11
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm4, %ymm11, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm7, %ymm3, %ymm7
+; AVX512F-NEXT: vpsrlw $2, %ymm7, %ymm11
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm6, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm8, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm11, %ymm7, %ymm11
; AVX512F-NEXT: vpsrlw $1, %ymm11, %ymm12
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm8, %ymm12, %ymm12
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm7, %ymm12, %ymm12
; AVX512F-NEXT: vpaddb %ymm10, %ymm10, %ymm10
; AVX512F-NEXT: vpblendvb %ymm10, %ymm12, %ymm11, %ymm10
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm11
+; AVX512F-NEXT: vpsllw $4, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm12, %ymm11, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %ymm9, %ymm13, %ymm14
; AVX512F-NEXT: vpsllw $5, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpsllw $2, %ymm8, %ymm11
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm15, %ymm11, %ymm11
; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm11
+; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm11
; AVX512F-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm10, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm14, %ymm11, %ymm8, %ymm8
+; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
; AVX512F-NEXT: vpxor %xmm10, %xmm10, %xmm10
; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm9, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw $5, %ymm5, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm7
-; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm4
-; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm12, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm5, %ymm13, %ymm6
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm8
+; AVX512F-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
+; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm8
+; AVX512F-NEXT: vpand %ymm6, %ymm8, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm6
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm6
+; AVX512F-NEXT: vpaddb %ymm5, %ymm5, %ymm5
+; AVX512F-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm12, %ymm5, %ymm5
+; AVX512F-NEXT: vpsubb %ymm2, %ymm13, %ymm6
; AVX512F-NEXT: vpsllw $5, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm15, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm5
+; AVX512F-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm5
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm6
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm5, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpeqb %ymm10, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm6
+; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw $5, %ymm5, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm6, %ymm3, %ymm6
; AVX512VL-NEXT: vpsrlw $2, %ymm6, %ymm10
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm11 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512VL-NEXT: vpand %ymm11, %ymm10, %ymm10
; AVX512VL-NEXT: vpand %ymm12, %ymm10, %ymm10
; AVX512VL-NEXT: vpaddb %ymm9, %ymm9, %ymm9
; AVX512VL-NEXT: vpblendvb %ymm9, %ymm10, %ymm6, %ymm6
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm9
+; AVX512VL-NEXT: vpsllw $4, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm10, %ymm9, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm13 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm4, %ymm13, %ymm14
+; AVX512VL-NEXT: vpsubb %ymm5, %ymm13, %ymm14
; AVX512VL-NEXT: vpsllw $5, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm9
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm15 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm15, %ymm9, %ymm9
; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm9
+; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpaddb %ymm4, %ymm4, %ymm9
; AVX512VL-NEXT: vpaddb %ymm14, %ymm14, %ymm14
-; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm14, %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpor %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsllw $5, %ymm4, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsrlw $2, %ymm2, %ymm7
+; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpsrlw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vpand %ymm11, %ymm7, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm7
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw $1, %ymm4, %ymm7
; AVX512VL-NEXT: vpand %ymm12, %ymm7, %ymm7
; AVX512VL-NEXT: vpaddb %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm5, %ymm7, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5
; AVX512VL-NEXT: vpand %ymm10, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsubb %ymm4, %ymm13, %ymm7
+; AVX512VL-NEXT: vpsubb %ymm2, %ymm13, %ymm7
; AVX512VL-NEXT: vpsllw $5, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm5
; AVX512VL-NEXT: vpand %ymm15, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm5
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm7
-; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm7, %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm4, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm4, %xmm7, %xmm7
+; AVX512F-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw %xmm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpor %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
+; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm4, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm4, %xmm7, %xmm7
+; AVX512VL-NEXT: vpsubw %xmm2, %xmm7, %xmm7
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpor %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm9
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7
; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm6, %ymm0, %ymm10
-; AVX512F-NEXT: vpsllw %xmm6, %xmm8, %xmm0
-; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm8
-; AVX512F-NEXT: vpand %ymm8, %ymm10, %ymm0
-; AVX512F-NEXT: vpor %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm10
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm9, %ymm9
+; AVX512F-NEXT: vpsllw %xmm3, %xmm8, %xmm6
+; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
+; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm8
+; AVX512F-NEXT: vpor %ymm10, %ymm8, %ymm8
; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw %xmm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6
; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7
; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
-; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm6, %ymm0, %ymm10
-; AVX512VL-NEXT: vpsllw %xmm6, %xmm8, %xmm0
-; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm8
-; AVX512VL-NEXT: vpand %ymm8, %ymm10, %ymm0
-; AVX512VL-NEXT: vpor %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6
+; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
+; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8
+; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8
; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm6, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm2, %ymm5
+; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512F-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm2, %ymm5
+; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm5
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm1, %ymm3
; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: vpmulhuw %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512F-NEXT: vpaddb %ymm6, %ymm6, %ymm8
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: vpmullw %ymm13, %ymm12, %ymm12
; AVX512F-NEXT: vpsrlw $8, %ymm12, %ymm12
; AVX512F-NEXT: vpackuswb %ymm10, %ymm12, %ymm10
-; AVX512F-NEXT: vpor %ymm10, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
+; AVX512F-NEXT: vpmullw %ymm11, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
; AVX512F-NEXT: vpmullw %ymm13, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512VL-NEXT: vpaddb %ymm6, %ymm6, %ymm8
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512VL-NEXT: vpaddb %ymm8, %ymm8, %ymm9
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: vpmullw %ymm12, %ymm11, %ymm11
; AVX512VL-NEXT: vpsrlw $8, %ymm11, %ymm11
; AVX512VL-NEXT: vpackuswb %ymm4, %ymm11, %ymm4
-; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm4, %ymm3, %ymm3
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm6, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm9, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vpmullw %ymm12, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpackuswb %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3
+; AVX512F-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $7, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsllw $9, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm4, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm6, %zmm0, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm7, %ymm2
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm6, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpord %zmm2, %zmm6, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpsubw %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpsubw %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512F-NEXT: vpsubw %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm6, %zmm0, %zmm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm7, %ymm2
+; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm6 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm6, %zmm0
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm2
-; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm6, %zmm2, %zmm6
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm7, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpord %zmm2, %zmm6, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm7, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm7, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i16:
define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm8, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm9
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm9
; AVX512F-NEXT: vpand %ymm8, %ymm9, %ymm9
; AVX512F-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm9, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm10
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512F-NEXT: vpor %ymm4, %ymm10, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
-; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm8, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm8, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpandn %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm9
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm9
; AVX512VL-NEXT: vpand %ymm8, %ymm9, %ymm9
; AVX512VL-NEXT: vpor %ymm4, %ymm9, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm9, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm10
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10
; AVX512VL-NEXT: vpor %ymm4, %ymm10, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsubb %ymm3, %ymm6, %ymm3
-; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm8, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-NEXT: vpand %ymm7, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm8, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm8, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm9, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v64i8:
define <32 x i16> @splatvar_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind {
; AVX512F-LABEL: splatvar_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [1,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
-; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512VL-NEXT: vpmullw %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v32i16:
define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: constant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
define <32 x i16> @splatconstant_funnnel_v32i16(<32 x i16> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $9, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $9, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $9, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $9, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v32i16:
define <64 x i8> @splatconstant_funnnel_v64i8(<64 x i8> %x) nounwind {
; AVX512F-LABEL: splatconstant_funnnel_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v64i8:
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
-; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3
-; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm2
+; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm2
+; AVX512F-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm7
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm7
; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_divconstant_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsraw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512F-NEXT: vpsraw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpsraw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpsraw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_64i8:
define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
-; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsrlw $15, %ymm2, %ymm3
; AVX512F-NEXT: vpsraw $1, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i16:
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm8, %ymm4, %ymm4
; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpackuswb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_remconstant_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm5
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpaddb %ymm5, %ymm4, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX512F-NEXT: vpsubb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpand %ymm1, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm6
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm6
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5
; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm1, %ymm4, %ymm4
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_remconstant_64i8:
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm6, %ymm6
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
define <64 x i8> @test_divconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_divconstant_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_divconstant_64i8:
define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
; AVX512F-NEXT: vpaddw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i16:
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm5
+; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5
; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512F-NEXT: vpand %ymm8, %ymm7, %ymm7
; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $3, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm8, %ymm3, %ymm3
; AVX512F-NEXT: vpsubb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_remconstant_64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[16],ymm1[16],ymm3[17],ymm1[17],ymm3[18],ymm1[18],ymm3[19],ymm1[19],ymm3[20],ymm1[20],ymm3[21],ymm1[21],ymm3[22],ymm1[22],ymm3[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
+; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpackuswb %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
+; AVX512F-NEXT: vpsubb %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15],ymm5[24],ymm2[24],ymm5[25],ymm2[25],ymm5[26],ymm2[26],ymm5[27],ymm2[27],ymm5[28],ymm2[28],ymm5[29],ymm2[29],ymm5[30],ymm2[30],ymm5[31],ymm2[31]
+; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm5
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15],ymm5[24],ymm1[24],ymm5[25],ymm1[25],ymm5[26],ymm1[26],ymm5[27],ymm1[27],ymm5[28],ymm1[28],ymm5[29],ymm1[29],ymm5[30],ymm1[30],ymm5[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm6, %ymm6
; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[16],ymm1[16],ymm5[17],ymm1[17],ymm5[18],ymm1[18],ymm5[19],ymm1[19],ymm5[20],ymm1[20],ymm5[21],ymm1[21],ymm5[22],ymm1[22],ymm5[23],ymm1[23]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpackuswb %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15],ymm4[24],ymm2[24],ymm4[25],ymm2[25],ymm4[26],ymm2[26],ymm4[27],ymm2[27],ymm4[28],ymm2[28],ymm4[29],ymm2[29],ymm4[30],ymm2[30],ymm4[31],ymm2[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15],ymm4[24],ymm1[24],ymm4[25],ymm1[25],ymm4[26],ymm1[26],ymm4[27],ymm1[27],ymm4[28],ymm1[28],ymm4[29],ymm1[29],ymm4[30],ymm1[30],ymm4[31],ymm1[31]
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm5, %ymm5
-; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23]
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23]
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_remconstant_64i8:
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
;
; AVX512DQ-LABEL: testv32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
%out = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512CD-NEXT: vpsubw %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
;
; AVX512DQ-LABEL: testv32i16u:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpaddw %ymm0, %ymm3, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm5
; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddw %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
%out = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
-; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
;
; AVX512DQ-LABEL: testv64i8:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
; AVX512CD: # %bb.0:
-; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
-; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
-; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
-; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24]
; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1
; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1
; AVX512CD-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
+; AVX512CD-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
+; AVX512CD-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512CD-NEXT: vpsubb %xmm3, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
;
; AVX512DQ-LABEL: testv64i8u:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
-; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
%out = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm3
-; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-LABEL: testv64i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
; AVX512VPOPCNTDQ-NOBW: # %bb.0:
+; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
;
; AVX512F-LABEL: trunc_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
;
; AVX512F-LABEL: trunc_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2
;
; AVX512F-LABEL: icmp_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
;
; AVX512F-LABEL: icmp_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
;
; AVX512DQ-LABEL: test_v32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
;
; AVX512DQVL-LABEL: test_v32i16:
; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
;
; AVX512DQ-LABEL: test_v64i8:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
;
; AVX512DQVL-LABEL: test_v64i8:
; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
;
; AVX512F-LABEL: trunc_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
;
; AVX512F-LABEL: trunc_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
;
; AVX512F-LABEL: icmp_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
;
; AVX512F-LABEL: icmp_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
;
; AVX512F-LABEL: trunc_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
;
; AVX512F-LABEL: trunc_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2
;
; AVX512F-LABEL: icmp_v32i16_v32i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
;
; AVX512F-LABEL: icmp_v64i8_v64i1:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm0, %zmm5, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512F-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512F-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpord %zmm2, %zmm5, %zmm2
+; AVX512F-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512F-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VL-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm5, %zmm0, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm0, %zmm5, %zmm0
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm5, %zmm2, %zmm5
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT: vpsubw %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512VL-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpord %zmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VL-NEXT: vpsllvd %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpsllvd %zmm3, %zmm1, %zmm3
-; AVX512VL-NEXT: vpsubw %ymm2, %ymm6, %ymm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpord %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512F-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512F-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm7
+; AVX512F-NEXT: vpsllw $2, %ymm3, %ymm7
; AVX512F-NEXT: vpand %ymm6, %ymm7, %ymm7
; AVX512F-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm8
+; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512F-NEXT: vpandn %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512F-NEXT: vpandn %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512F-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm4, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm6
+; AVX512VL-NEXT: vpsllw $4, %ymm3, %ymm6
; AVX512VL-NEXT: vpand %ymm5, %ymm6, %ymm6
; AVX512VL-NEXT: vpor %ymm4, %ymm6, %ymm4
; AVX512VL-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $6, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm7
+; AVX512VL-NEXT: vpsllw $2, %ymm3, %ymm7
; AVX512VL-NEXT: vpand %ymm6, %ymm7, %ymm7
; AVX512VL-NEXT: vpor %ymm4, %ymm7, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm8
+; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8
; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $6, %ymm1, %ymm2
-; AVX512VL-NEXT: vpandn %ymm2, %ymm6, %ymm2
-; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm3
+; AVX512VL-NEXT: vpandn %ymm3, %ymm6, %ymm3
+; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm4
-; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
+; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512F-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpsubw %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsubw %xmm1, %xmm5, %xmm1
+; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i16:
define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: splatvar_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm2, %ymm4
; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsubb %xmm1, %xmm7, %xmm1
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5
; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
-; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v64i8:
define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512F-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512F-NEXT: # ymm10 = mem[0,1,0,1]
; AVX512F-NEXT: vpmullw %ymm10, %ymm9, %ymm9
; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512F-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm9, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512F-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15],ymm1[24],ymm5[24],ymm1[25],ymm5[25],ymm1[26],ymm5[26],ymm1[27],ymm5[27],ymm1[28],ymm5[28],ymm1[29],ymm5[29],ymm1[30],ymm5[30],ymm1[31],ymm5[31]
+; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm5[8],ymm0[9],ymm5[9],ymm0[10],ymm5[10],ymm0[11],ymm5[11],ymm0[12],ymm5[12],ymm0[13],ymm5[13],ymm0[14],ymm5[14],ymm0[15],ymm5[15],ymm0[24],ymm5[24],ymm0[25],ymm5[25],ymm0[26],ymm5[26],ymm0[27],ymm5[27],ymm0[28],ymm5[28],ymm0[29],ymm5[29],ymm0[30],ymm5[30],ymm0[31],ymm5[31]
; AVX512F-NEXT: vpmullw %ymm10, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[16],ymm5[16],ymm1[17],ymm5[17],ymm1[18],ymm5[18],ymm1[19],ymm5[19],ymm1[20],ymm5[20],ymm1[21],ymm5[21],ymm1[22],ymm5[22],ymm1[23],ymm5[23]
-; AVX512F-NEXT: vpmullw %ymm11, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[4],ymm5[4],ymm0[5],ymm5[5],ymm0[6],ymm5[6],ymm0[7],ymm5[7],ymm0[16],ymm5[16],ymm0[17],ymm5[17],ymm0[18],ymm5[18],ymm0[19],ymm5[19],ymm0[20],ymm5[20],ymm0[21],ymm5[21],ymm0[22],ymm5[22],ymm0[23],ymm5[23]
+; AVX512F-NEXT: vpmullw %ymm11, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm5
; AVX512VL-NEXT: vpaddb %ymm7, %ymm7, %ymm8
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm5, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512VL-NEXT: # ymm9 = mem[0,1,0,1]
; AVX512VL-NEXT: vpmullw %ymm9, %ymm5, %ymm5
; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [256,2,4,8,16,32,64,128,256,2,4,8,16,32,64,128]
; AVX512VL-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpackuswb %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm2
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm3
; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm3
; AVX512VL-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vpmullw %ymm9, %ymm3, %ymm3
; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmullw %ymm10, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm10, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $9, %ymm0, %ymm2
+; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm3
+; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55]
-; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
-; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512F-NEXT: vpandn %ymm4, %ymm3, %ymm4
+; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm4
-; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpandn %ymm4, %ymm3, %ymm4
+; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
;
; AVX512F-LABEL: sext_32i8_to_32i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sext_32i8_to_32i16:
;
; AVX512F-LABEL: sext_32xi1_to_32xi8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
+; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512DQ-NEXT: vpsravd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15],ymm0[24],ymm3[24],ymm0[25],ymm3[25],ymm0[26],ymm3[26],ymm0[27],ymm3[27],ymm0[28],ymm3[28],ymm0[29],ymm3[29],ymm0[30],ymm3[30],ymm0[31],ymm3[31]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT: vpsraw $4, %ymm5, %ymm6
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpsraw $2, %ymm5, %ymm6
; AVX512DQ-NEXT: vpaddw %ymm4, %ymm4, %ymm4
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm6, %ymm5, %ymm4
; AVX512DQ-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[16],ymm3[16],ymm0[17],ymm3[17],ymm0[18],ymm3[18],ymm0[19],ymm3[19],ymm0[20],ymm3[20],ymm0[21],ymm3[21],ymm0[22],ymm3[22],ymm0[23],ymm3[23]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm2
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpsraw $4, %ymm2, %ymm5
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsraw $2, %ymm2, %ymm5
+; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsraw $1, %ymm2, %ymm5
+; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $4, %ymm4, %ymm5
; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpsraw $2, %ymm4, %ymm5
; AVX512DQ-NEXT: vpaddw %ymm3, %ymm3, %ymm3
; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-NEXT: vpsraw $4, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsraw $2, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsraw $1, %ymm1, %ymm4
-; AVX512DQ-NEXT: vpaddw %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpsraw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsraw $1, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpaddw %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vpsraw %xmm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
-; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsubb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsubb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpsravd %zmm2, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
-; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512DQ-NEXT: vpsraw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512DQ-NEXT: vpsraw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512DQ-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512DQ-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
; AVX512DQ-LABEL: ashr_const7_v64i8:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: ashr_const7_v64i8:
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpsrlvd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpsrlw $4, %ymm3, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlw $1, %ymm3, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT: vpand %ymm7, %ymm4, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $2, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $1, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm7, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $2, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $1, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = <u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2>
-; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpmullw %ymm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT: vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX512DQ-NEXT: vpmullw %ymm5, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
+; AVX512DQ-NEXT: vpsllvd %zmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovdw %zmm2, %ymm2
+; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpsllw $4, %ymm3, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsllw $2, %ymm3, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512DQ-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm4
; AVX512DQ-NEXT: vpaddb %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsllw $5, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpsllw %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsllw %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
-; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm4, %ymm4, %ymm6
-; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7
-; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsllw $4, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $2, %ymm1, %ymm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm4, %ymm4, %ymm6
; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm1, %ymm1, %ymm2
+; AVX512DQ-NEXT: vpaddb %ymm6, %ymm6, %ymm7
; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpaddb %ymm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
-; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; KNL-LABEL: shuffle_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
+; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
-; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
-; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
-; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
-; KNL-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## %bb.0:
-; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8,9,10,11],ymm2[12,13],ymm1[14],ymm2[15]
-; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15]
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm4
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
-; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
-; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
-; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,6],ymm3[7],ymm1[8,9,10,11,12,13,14],ymm3[15]
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15]
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
+; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
}
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; KNL-NEXT: retq
-;
-; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; SKX-NEXT: retq
+; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
+; ALL: ## %bb.0:
+; ALL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
-; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; KNL-NEXT: retq
-;
-; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; SKX-NEXT: retq
+; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
+; ALL: ## %bb.0:
+; ALL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; ALL-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
}
define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
; KNL: ## %bb.0:
-; KNL-NEXT: vpsrld $16, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpsrld $16, %ymm1, %ymm1
+; KNL-NEXT: vpsrld $16, %ymm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
; KNL: ## %bb.0:
-; KNL-NEXT: vpslld $16, %ymm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpslld $16, %ymm1, %ymm1
+; KNL-NEXT: vpslld $16, %ymm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
-; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
+; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
; KNL-NEXT: vmovd %eax, %xmm1
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL-LABEL: insert_dup_mem_v32i16_i32:
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_i32:
; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32:
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32:
; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
-; KNL-NEXT: vmovdqa %ymm0, %ymm1
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; KNL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vmovdqa %ymm2, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
define <8 x i16> @pr32967(<32 x i16> %v) {
; KNL-LABEL: pr32967:
; KNL: ## %bb.0:
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
;
; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
;
; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
; AVX512F-NEXT: movl $255, %eax
; AVX512F-NEXT: vmovd %eax, %xmm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ-NEXT: movl $255, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm1
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
; AVX512F-LABEL: insert_dup_mem_v64i8_i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_i32:
; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32:
; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8:
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32:
; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32:
; AVX512F-NEXT: shrl $8, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
; AVX512DQ-NEXT: shrl $8, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm0
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
-; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm1
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
;
; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_76_78_16_18_20_22_24_26_28_30_80_82_84_86_88_90_92_94_32_34_36_38_40_42_44_46_96_98_100_102_104_106_108_110_48_50_52_54_56_58_60_62_112_114_116_118_120_122_124_126:
;
; KNL64-LABEL: test_mm512_mask_blend_epi8:
; KNL64: # %bb.0: # %entry
-; KNL64-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL64-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; KNL64-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; KNL64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; KNL64-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; KNL64-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; KNL64-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm3
+; KNL64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; KNL64-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi8:
;
; KNL32-LABEL: test_mm512_mask_blend_epi8:
; KNL32: # %bb.0: # %entry
-; KNL32-NEXT: pushl %ebp
-; KNL32-NEXT: .cfi_def_cfa_offset 8
-; KNL32-NEXT: .cfi_offset %ebp, -8
-; KNL32-NEXT: movl %esp, %ebp
-; KNL32-NEXT: .cfi_def_cfa_register %ebp
-; KNL32-NEXT: andl $-32, %esp
-; KNL32-NEXT: subl $32, %esp
-; KNL32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL32-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
-; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
-; KNL32-NEXT: movl %ebp, %esp
-; KNL32-NEXT: popl %ebp
-; KNL32-NEXT: .cfi_def_cfa %esp, 4
+; KNL32-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; KNL32-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; KNL32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; KNL32-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm3
+; KNL32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; KNL32-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; KNL32-NEXT: retl
entry:
%0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> <i32 64, i32 1, i32 66, i32 3, i32 68, i32 5, i32 70, i32 7, i32 72, i32 9, i32 74, i32 11, i32 76, i32 13, i32 78, i32 15, i32 80, i32 17, i32 82, i32 19, i32 84, i32 21, i32 86, i32 23, i32 88, i32 25, i32 90, i32 27, i32 92, i32 29, i32 94, i32 31, i32 96, i32 33, i32 98, i32 35, i32 100, i32 37, i32 102, i32 39, i32 104, i32 41, i32 106, i32 43, i32 108, i32 45, i32 110, i32 47, i32 112, i32 49, i32 114, i32 51, i32 116, i32 53, i32 118, i32 55, i32 120, i32 57, i32 122, i32 59, i32 124, i32 61, i32 126, i32 63>
;
; KNL64-LABEL: test_mm512_mask_blend_epi16:
; KNL64: # %bb.0: # %entry
-; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
-; KNL64-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
+; KNL64-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL64-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; KNL64-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7],ymm3[8],ymm2[9],ymm3[10],ymm2[11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
+; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; KNL64-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi16:
;
; KNL32-LABEL: test_mm512_mask_blend_epi16:
; KNL32: # %bb.0: # %entry
-; KNL32-NEXT: pushl %ebp
-; KNL32-NEXT: .cfi_def_cfa_offset 8
-; KNL32-NEXT: .cfi_offset %ebp, -8
-; KNL32-NEXT: movl %esp, %ebp
-; KNL32-NEXT: .cfi_def_cfa_register %ebp
-; KNL32-NEXT: andl $-32, %esp
-; KNL32-NEXT: subl $32, %esp
-; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
-; KNL32-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1],mem[2],ymm1[3],mem[4],ymm1[5],mem[6],ymm1[7],mem[8],ymm1[9],mem[10],ymm1[11],mem[12],ymm1[13],mem[14],ymm1[15]
-; KNL32-NEXT: movl %ebp, %esp
-; KNL32-NEXT: popl %ebp
-; KNL32-NEXT: .cfi_def_cfa %esp, 4
+; KNL32-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; KNL32-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; KNL32-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7],ymm3[8],ymm2[9],ymm3[10],ymm2[11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
+; KNL32-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; KNL32-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; KNL32-NEXT: retl
entry:
%0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> <i32 32, i32 1, i32 34, i32 3, i32 36, i32 5, i32 38, i32 7, i32 40, i32 9, i32 42, i32 11, i32 44, i32 13, i32 46, i32 15, i32 48, i32 17, i32 50, i32 19, i32 52, i32 21, i32 54, i32 23, i32 56, i32 25, i32 58, i32 27, i32 60, i32 29, i32 62, i32 31>
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
+; AVX512F-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
+; AVX512F-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm5
; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm1, %ymm0
+; AVX512VL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm0
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
-; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm5, %zmm6
; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3
+; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm5
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512F-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm1
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
-; AVX512F-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm5
; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm6
; AVX512VL-NEXT: vptestmd %zmm6, %zmm6, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vpmovdw %zmm0, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm0
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm1
+; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v32i16_v32i8:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0
;
; AVX512VL-LABEL: trunc_usat_v32i16_v32i8:
; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv32i16:
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm4
-; AVX512CD-NEXT: vpaddb %ymm0, %ymm4, %ymm0
-; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512CD-NEXT: vpaddw %ymm2, %ymm1, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpsllw $8, %ymm1, %ymm4
+; AVX512CD-NEXT: vpaddb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpsllw $8, %ymm0, %ymm2
+; AVX512CD-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddw %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv32i16u:
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
;
; AVX512VPOPCNTDQ-LABEL: testv64i8:
; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv64i8:
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm3
-; AVX512CD-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3
+; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm2
-; AVX512CD-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512CD-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512CD-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512CD-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512CD-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512CD-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
;
; AVX512VPOPCNTDQ-LABEL: testv64i8u:
; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm3
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm4
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4
-; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm1, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm2
-; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2
; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm4, %ymm1, %ymm1
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
; BITALG-LABEL: testv64i8u:
;
; AVX512F-LABEL: zext_32i8_to_32i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: zext_32i8_to_32i16:
;
; AVX512F-LABEL: test_abs_lt_v64i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
; AVX512F-NEXT: vpabsb %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc9]
+; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_lt_v64i8:
;
; AVX512F-LABEL: test_abs_gt_v32i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x48,0x3b,0xc1,0x01]
; AVX512F-NEXT: vpabsw %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc9]
+; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc1,0x01]
; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_gt_v32i16: