; AVX1-NEXT: shlq $32, %rdx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f32_to_8i16_undef:
; AVX2-NEXT: shlq $32, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
; AVX1-NEXT: shlq $32, %rdx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f32_to_8i16_zero:
; AVX2-NEXT: shlq $32, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: shlq $32, %rdx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: shlq $32, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512VL-NEXT: retq
; AVX1-NEXT: shlq $32, %rdx
; AVX1-NEXT: orq %rcx, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-NEXT: shlq $32, %rdx
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-NEXT: orq %rcx, %rdx
; AVX512VL-NEXT: vmovq %rdx, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %r14, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: addq $40, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %r14, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %r14, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: addq $40, %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512VL-NEXT: orq %r14, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: addq $40, %rsp
; AVX512VL-NEXT: popq %rbx
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %r14, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: addq $40, %rsp
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r14
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %r14, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: addq $40, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r14
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %r14, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: addq $40, %rsp
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r14
; AVX512VL-NEXT: orq %r14, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rbx, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
; AVX512F-NEXT: addq $32, %rsp
; AVX512F-NEXT: popq %rbx
; AVX512VL-NEXT: orq %rbx, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%r14)
; AVX512VL-NEXT: addq $32, %rsp
; AVX1-NEXT: shlq $32, %rax
; AVX1-NEXT: orq %rbx, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, (%r14)
; AVX1-NEXT: addq $32, %rsp
; AVX1-NEXT: popq %rbx
; AVX2-NEXT: shlq $32, %rax
; AVX2-NEXT: orq %rbx, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vmovdqa %xmm0, (%r14)
; AVX2-NEXT: addq $32, %rsp
; AVX2-NEXT: popq %rbx
; AVX512F-NEXT: shlq $32, %rax
; AVX512F-NEXT: orq %rbx, %rax
; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%r14)
; AVX512F-NEXT: addq $32, %rsp
; AVX512F-NEXT: popq %rbx
; AVX512VL-NEXT: orq %rbx, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm0
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]