; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
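+; These tests exercise the masked/zero-masked byte broadcast intrinsics. A minimal
+; sketch of the C-level source (assuming the standard immintrin.h signature):
+;   __m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
+;     return _mm512_mask_set1_epi8(__O, __M, __A);
+;   }
+; On X64 the i64 mask moves directly into a k-register, so the lowering is just
+; kmovq plus one masked vpbroadcastb. On X32 there is no 64-bit GPR, so the
+; backend currently reassembles the 64-bit mask bit-by-bit, producing the long
+; check sequences below.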
+define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi8:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .Lcfi0:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .Lcfi1:
+; X32-NEXT: .cfi_offset %ebx, -8
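+; The low dword of the i64 mask is loaded from the stack; each bit (or small bit
+; group) is isolated with shifts/ands, moved to a k-register with kmovd, expanded
+; to a byte vector with vpmovm2b, shifted into its byte lane, blended into the
+; accumulated vector with vpblendvb, and collapsed back to a mask with vpmovb2m.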
+; X32-NEXT: vmovdqa64 %zmm0, %zmm3
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpsllw $8, %xmm1, %xmm1
+; X32-NEXT: kmovd %eax, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastw %xmm2, %xmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpslld $24, %xmm2, %xmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastd %xmm2, %xmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpsllq $40, %xmm2, %xmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastw %xmm2, %xmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm2
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpsllq $56, %xmm1, %xmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastq %xmm1, %xmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $61440, %ecx # imm = 0xF000
+; X32-NEXT: shrl $12, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $49152, %ecx # imm = 0xC000
+; X32-NEXT: shrl $14, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32768, %ecx # imm = 0x8000
+; X32-NEXT: shrl $15, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpsllw $8, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslld $24, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpsllq $40, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpsllq $56, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastq %xmm1, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6]
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; X32-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm1, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2]
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; X32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
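+; Bits 32-63 come from the second stack dword; the same per-bit expansion now
+; targets the upper 256-bit half via vextracti64x4/vinserti64x4.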
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k1, %zmm7
+; X32-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm6, %ymm1, %ymm7, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslld $24, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $40, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $56, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $61440, %ecx # imm = 0xF000
+; X32-NEXT: shrl $12, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $49152, %ecx # imm = 0xC000
+; X32-NEXT: shrl $14, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32768, %ecx # imm = 0x8000
+; X32-NEXT: shrl $15, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslld $24, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $40, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $56, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm6, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm6
+; X32-NEXT: vpblendvb %ymm5, %ymm6, %ymm0, %ymm0
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; X32-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm2
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k2
+; X32-NEXT: vpmovm2b %k2, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastd %xmm2, %xmm2
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k1, %zmm2
+; X32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastw %xmm2, %xmm2
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
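+; The final bit is merged with mask-register arithmetic: kshiftlq/kshiftrq clear
+; bit 63 of the accumulated mask, kshiftlq $63 positions the last extracted bit,
+; and korq combines the two before the single masked broadcast.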
+; X32-NEXT: kshiftlq $1, %k0, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k0
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: korq %k1, %k0, %k1
+; X32-NEXT: vpbroadcastb %eax, %zmm3 {%k1}
+; X32-NEXT: vmovdqa64 %zmm3, %zmm0
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi8:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vpbroadcastb %esi, %zmm0 {%k1}
+; X64-NEXT: retq
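+; IR: splat %__A across all 64 bytes, then select between the splat and the
+; passthrough %__O under the 64-bit mask; this is the canonical masked set1
+; pattern.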
+entry:
+  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
+  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
+  %0 = bitcast <8 x i64> %__O to <64 x i8>
+  %1 = bitcast i64 %__M to <64 x i1>
+  %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
+  %3 = bitcast <64 x i8> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
+
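+; The zero-masking variant takes the same X32 bit-by-bit reconstruction path;
+; only the passthrough differs (zeros instead of %__O in the select).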
+define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi8:
+; X32: # BB#0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .Lcfi2:
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .Lcfi3:
+; X32-NEXT: .cfi_offset %ebx, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: kmovd %eax, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpslld $24, %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpsllq $40, %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpbroadcastw %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vpsllq $56, %xmm1, %xmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $61440, %ecx # imm = 0xF000
+; X32-NEXT: shrl $12, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $49152, %ecx # imm = 0xC000
+; X32-NEXT: shrl $14, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32768, %ecx # imm = 0x8000
+; X32-NEXT: shrl $15, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslld $24, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $40, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $56, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; X32-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; X32-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; X32-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; X32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k1, %zmm7
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslld $24, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $40, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $56, %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $61440, %ecx # imm = 0xF000
+; X32-NEXT: shrl $12, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $49152, %ecx # imm = 0xC000
+; X32-NEXT: shrl $14, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andl $32768, %ecx # imm = 0x8000
+; X32-NEXT: shrl $15, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllw $8, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpslld $24, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastd %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $40, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastw %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpsllq $56, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpmovm2b %k1, %zmm0
+; X32-NEXT: vpbroadcastq %xmm0, %ymm0
+; X32-NEXT: vpmovm2b %k0, %zmm1
+; X32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; X32-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; X32-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm1
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k3
+; X32-NEXT: vpmovm2b %k3, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastw %xmm2, %xmm2
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k1, %zmm2
+; X32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastd %xmm2, %xmm2
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: vpmovm2b %k2, %zmm2
+; X32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; X32-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: vpmovb2m %zmm0, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: vpbroadcastw %xmm2, %xmm2
+; X32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; X32-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k0
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vpmovb2m %zmm0, %k1
+; X32-NEXT: kshiftlq $1, %k1, %k1
+; X32-NEXT: kshiftrq $1, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k0, %k0
+; X32-NEXT: korq %k0, %k1, %k1
+; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z}
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi8:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vpbroadcastb %esi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
+  %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
+  %0 = bitcast i64 %__M to <64 x i1>
+  %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
+  %2 = bitcast <64 x i8> %1 to <8 x i64>
+  ret <8 x i64> %2
+}
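+; NOTE: The X32 body above is so long because a 64-bit mask cannot be moved
+; from a single GPR into a k-register in 32-bit mode, so the backend rebuilds
+; the mask bit by bit (kmovd + vpmovm2b + vpblendvb per mask bit), while X64
+; needs only kmovq. A minimal C-level sketch of what this test exercises,
+; assuming the <immintrin.h> intrinsic of the same name (the wrapper name
+; here is hypothetical):
+;   __m512i set1_epi8_z(__mmask64 m, char a) {
+;     return _mm512_maskz_set1_epi8(m, a); // zero the lanes where m is 0
+;   }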
+
+define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi16:
+; X32: # BB#0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi16:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+  %0 = bitcast <8 x i64> %__O to <32 x i16>
+  %1 = bitcast i32 %__M to <32 x i1>
+  %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
+  %3 = bitcast <32 x i16> %2 to <8 x i64>
+  ret <8 x i64> %3
+}
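+; NOTE: A 32-bit mask fits in one GPR on both targets, so the lowering is
+; just kmovd plus a masked vpbroadcastw. A C-level sketch, assuming the
+; <immintrin.h> intrinsic of the same name (the wrapper name is hypothetical):
+;   __m512i set1_epi16_m(__m512i src, __mmask32 m, short a) {
+;     return _mm512_mask_set1_epi16(src, m, a); // keep src lanes where m is 0
+;   }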
+
+define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi16:
+; X32: # BB#0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpbroadcastw %eax, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi16:
+; X64: # BB#0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+  %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+  %0 = bitcast i32 %__M to <32 x i1>
+  %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
+  %2 = bitcast <32 x i16> %1 to <8 x i64>
+  ret <8 x i64> %2
+}
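+; NOTE: Same lowering as the masked form above, except the {z} flag zeroes
+; the unselected lanes instead of merging into a source vector. C-level
+; sketch, assuming the <immintrin.h> intrinsic of the same name (the wrapper
+; name is hypothetical):
+;   __m512i set1_epi16_z(__mmask32 m, short a) {
+;     return _mm512_maskz_set1_epi16(m, a); // zero the lanes where m is 0
+;   }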
+
define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastb_epi8:
; X32: # BB#0: