ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
+ // Byte shifts by immediate.
+ case X86ISD::VSHLDQ:
+ case X86ISD::VSRLDQ:
+ // Shift by uniform.
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
+ // Shift by immediate.
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ SDLoc DL(Op);
+ SDValue Ext0 =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
+ SDValue ExtOp =
+ TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
+ SDValue UndefVec = TLO.DAG.getUNDEF(VT);
+ SDValue Insert =
+ insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_vec_udiv_nonuniform4:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movl $171, %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_vec_udiv_nonuniform4:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: movl $171, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_vec_udiv_nonuniform4:
+; AVX: # %bb.0:
+; AVX-NEXT: movl $171, %eax
+; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpackuswb %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
;
; XOP-LABEL: combine_vec_udiv_nonuniform4:
; XOP: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpsllw %xmm3, %ymm5, %ymm3
+; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512F-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm3
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw %xmm4, %ymm5, %ymm4
-; AVX512VL-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm5, %ymm0, %ymm6
-; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9
-; AVX512F-NEXT: vpsllw %xmm5, %ymm9, %ymm8
-; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8
-; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512F-NEXT: vpbroadcastb %xmm9, %ymm9
-; AVX512F-NEXT: vpand %ymm9, %ymm2, %ymm2
-; AVX512F-NEXT: vpor %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpand %ymm9, %ymm3, %ymm3
-; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX512F-NEXT: vpsllw %xmm5, %xmm8, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
+; AVX512F-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
+; AVX512F-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512F-NEXT: vpand %ymm2, %ymm10, %ymm8
+; AVX512F-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
+; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
+; AVX512F-NEXT: vpsllw %xmm5, %ymm1, %ymm5
+; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpor %ymm2, %ymm5, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: retq
;
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm6
-; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm9, %ymm8
-; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8
-; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw $8, %ymm9, %ymm9
-; AVX512VL-NEXT: vpbroadcastb %xmm9, %ymm9
-; AVX512VL-NEXT: vpand %ymm9, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm6, %ymm2
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm2
-; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsrlw %xmm7, %ymm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm9, %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrlw %xmm6, %ymm2, %ymm10
+; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm2
+; AVX512VL-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
+; AVX512VL-NEXT: vpand %ymm2, %ymm10, %ymm8
+; AVX512VL-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
+; AVX512VL-NEXT: vpblendvb %ymm4, %ymm0, %ymm8, %ymm0
+; AVX512VL-NEXT: vpsllw %xmm5, %ymm1, %ymm5
+; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm6, %ymm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpor %ymm2, %ymm5, %ymm2
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: retq
;
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
-; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm5, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpsrlw %xmm3, %ymm5, %ymm3
-; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX2-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw %xmm4, %ymm5, %ymm4
+; AVX2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX2-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw %xmm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512F-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512F-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw %xmm4, %ymm5, %ymm4
+; AVX512F-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512F-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw %xmm3, %ymm5, %ymm3
-; AVX512VL-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm5, %ymm4
+; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
-; AVX512F-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9
-; AVX512F-NEXT: vpsrlw %xmm5, %ymm9, %ymm8
-; AVX512F-NEXT: vpsrlw $8, %ymm8, %ymm8
-; AVX512F-NEXT: vpbroadcastb %xmm8, %ymm8
-; AVX512F-NEXT: vpand %ymm8, %ymm6, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm4, %xmm7, %xmm7
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllw %xmm7, %ymm9, %ymm9
-; AVX512F-NEXT: vpbroadcastb %xmm9, %ymm9
-; AVX512F-NEXT: vpand %ymm9, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512F-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX512F-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
+; AVX512F-NEXT: vpsrlw $8, %xmm7, %xmm7
+; AVX512F-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512F-NEXT: vpand %ymm7, %ymm6, %ymm9
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm4, %xmm6, %xmm6
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsllw %xmm6, %ymm0, %ymm10
+; AVX512F-NEXT: vpsllw %xmm6, %xmm8, %xmm0
+; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm8
+; AVX512F-NEXT: vpand %ymm8, %ymm10, %ymm0
+; AVX512F-NEXT: vpor %ymm9, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512F-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512F-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512F-NEXT: vpsllw %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm9, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllw %xmm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm8, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: retq
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm5, %ymm2, %ymm6
-; AVX512VL-NEXT: vpcmpeqd %ymm9, %ymm9, %ymm9
-; AVX512VL-NEXT: vpsrlw %xmm5, %ymm9, %ymm8
-; AVX512VL-NEXT: vpsrlw $8, %ymm8, %ymm8
-; AVX512VL-NEXT: vpbroadcastb %xmm8, %ymm8
-; AVX512VL-NEXT: vpand %ymm8, %ymm6, %ymm6
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm4, %xmm7, %xmm7
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm9, %ymm9
-; AVX512VL-NEXT: vpbroadcastb %xmm9, %ymm9
-; AVX512VL-NEXT: vpand %ymm9, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm6, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm4
+; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8
+; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7
+; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7
+; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7
+; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm9
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm4, %xmm6, %xmm6
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm6, %ymm0, %ymm10
+; AVX512VL-NEXT: vpsllw %xmm6, %xmm8, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm8
+; AVX512VL-NEXT: vpand %ymm8, %ymm10, %ymm0
+; AVX512VL-NEXT: vpor %ymm9, %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm4, %ymm4
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw %xmm5, %ymm3, %ymm2
-; AVX512VL-NEXT: vpand %ymm8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpsllw %xmm7, %ymm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm9, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm7, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllw %xmm6, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm8, %ymm1, %ymm1
; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: retq
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
+; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
+; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
-; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512F-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512F-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm2
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512F-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512F-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512F-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512F-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512F-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512F-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsllw %xmm3, %ymm5, %ymm6
+; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm5, %xmm6
; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6
; AVX512VL-NEXT: vpand %ymm6, %ymm4, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm2, %xmm7, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm2, %ymm5, %ymm5
-; AVX512VL-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm2, %xmm5, %xmm5
+; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5
; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm4, %ymm0
; AVX512BW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsllw %xmm2, %zmm4, %zmm2
+; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm4, %zmm1
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm3
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm4, %zmm2
+; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm4, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512DQVL-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; X32-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
+; X32-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; X32-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
; X32-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2
; X32-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmulhw {{\.LCPI.*}}, %ymm0, %ymm1
; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
-; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
+; X32-AVX2-NEXT: vpsraw $1, %xmm0, %xmm0
; X32-AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-AVX2-NEXT: retl
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; X32-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; X32-AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; X32-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; X32-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpsllw %xmm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpsllw %xmm1, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; X32-AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-AVX2-NEXT: vpsllw %xmm1, %ymm2, %ymm1
+; X32-AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpsllw %xmm2, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpsllw %xmm2, %xmm3, %xmm3
; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw %xmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq