if (NumUpperHalves == 1) {
// AVX2 has efficient 32/64-bit element cross-lane shuffles.
if (Subtarget.hasAVX2()) {
- // extract128 + vunpckhps, is better than vblend + vpermps.
- // TODO: Refine to account for unary shuffle, splat, and other masks?
- if (EltWidth == 32 && NumLowerHalves &&
- HalfVT.is128BitVector() && !is128BitUnpackShuffleMask(HalfMask))
+ // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
+ if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
+ !is128BitUnpackShuffleMask(HalfMask) &&
+ (!isSingleSHUFPSMask(HalfMask) ||
+ Subtarget.hasFastVariableShuffle()))
return SDValue();
// If this is a unary shuffle (assume that the 2nd operand is
// canonicalized to undef), then we can use vpermpd. Otherwise, we
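
The test diffs below show the observable effect of the tightened guard: on AVX2 subtargets without fast variable shuffles (the *-SLOW check prefixes), a v4i64 -> v4i32 truncation-style shuffle whose half mask is a single-SHUFPS mask (isSingleSHUFPSMask) now lowers as vextractf128 + vshufps instead of the cross-lane vpermilps/vpshufd + vpermpd/vpermq pair, while subtargets reporting hasFastVariableShuffle() keep the wide variable shuffle. A minimal standalone reproducer is sketched here; it is not one of the updated tests, and the RUN line and the -fast-variable-shuffle attribute name are assumptions for illustration only.

; Hypothetical reproducer (assumed RUN line and subtarget feature name):
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,-fast-variable-shuffle | FileCheck %s
;
; The trunc becomes a "take the even 32-bit elements" shuffle; with the change
; above it is expected to lower like the trunc4 test that follows.
; CHECK-LABEL: trunc_example:
; CHECK: vextractf128 $1, %ymm0, %xmm1
; CHECK: vshufps {{.*}} xmm0 = xmm0[0,2],xmm1[0,2]
define <4 x i32> @trunc_example(<4 x i64> %a) nounwind {
  %t = trunc <4 x i64> %a to <4 x i32>
  ret <4 x i32> %t
}
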
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; X32-SLOW-LABEL: trunc4:
; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X32-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
;
; X64-SLOW-LABEL: trunc4:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X64-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-SLOW-LABEL: srl_trunc_and_v4i64:
; X32-SLOW: # %bb.0:
-; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X32-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X32-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X32-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X32-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-SLOW-NEXT: vzeroupper
; X32-SLOW-NEXT: retl
;
; X64-SLOW-LABEL: srl_trunc_and_v4i64:
; X64-SLOW: # %bb.0:
-; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
-; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8]
+; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-SLOW-NEXT: vzeroupper
; X64-SLOW-NEXT: retq
;
; AVX-SLOW-LABEL: combine_vec_shl_trunc_and:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vzeroupper
; AVX-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: v12i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
-; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi)
-; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6>
+; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
+; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi)
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-LABEL: trunc_shl_7_v4i32_v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps (%rsi), %xmm0
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in
%shl = shl <4 x i64> %val, <i64 7, i64 7, i64 7, i64 7>
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[u],zero
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
;
; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8:
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
;
; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
;
; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
;
; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
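The CHECK lines above are the clamp-then-truncate sequence for trunc_packus_v8i64_v8i32; the IR body is not included in this excerpt. A sketch of the usual packus pattern (clamp to [0, 4294967295], then truncate) that the vpcmpgtq/vblendvpd/vpand sequence corresponds to — the constants and value names here are assumed, not taken from the patch:

define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
entry:
  ; clamp from above: smin(a0, UINT32_MAX), matching the vpcmpgtq + vblendvpd pair
  %lt = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %min = select <8 x i1> %lt, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  ; clamp from below: smax(min, 0), matching the vpcmpgtq-with-zero + vpand pair
  %gt = icmp sgt <8 x i64> %min, zeroinitializer
  %max = select <8 x i1> %gt, <8 x i64> %min, <8 x i64> zeroinitializer
  %t = trunc <8 x i64> %max to <8 x i32>
  ret <8 x i32> %t
}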
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3
+; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
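Note the [1,3] lane indices in this block: an arithmetic shift right by 32 followed by a truncate to i32 selects exactly the high 32-bit half of every i64 element, so the new lowering can pick the odd i32 lanes directly with vshufps. The IR body is not shown in this excerpt; a sketch of the assumed pattern:

define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
entry:
  ; shift each i64 lane right by 32, then keep its low 32 bits (the original
  ; high halves) - this is what the xmm[1,3] shuffle operands above select
  %sh = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %tr = trunc <8 x i64> %sh to <8 x i32>
  ret <8 x i32> %tr
}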
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
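trunc2x4i64_8i32 concatenates two truncated <4 x i64> values into a single <8 x i32>, which is why each source ymm is narrowed separately above and the two halves are rejoined with vinsertf128. The IR body is not part of this excerpt; a sketch of the assumed pattern:

define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
entry:
  ; truncate each input independently, then concatenate the two <4 x i32> halves
  %ta = trunc <4 x i64> %a to <4 x i32>
  %tb = trunc <4 x i64> %b to <4 x i32>
  %r = shufflevector <4 x i32> %ta, <4 x i32> %tb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %r
}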
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
;
; AVX2-SLOW-LABEL: trunc8i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
;
; AVX2-SLOW-LABEL: trunc8i64_8i8:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0