return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
-static bool isUseOfShuffle(SDNode *N) {
+static bool isFoldableUseOfShuffle(SDNode *N) {
for (auto *U : N->uses()) {
- if (isTargetShuffle(U->getOpcode()))
+ unsigned Opc = U->getOpcode();
+ // VPERMV/VPERMV3 shuffles can never fold their index operands.
+ if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
+ return false;
+ if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
+ return false;
+ if (isTargetShuffle(Opc))
+ return true;
+ if (Opc == ISD::BITCAST) // Ignore bitcasts
+ return isFoldableUseOfShuffle(U);
+ if (N->hasOneUse())
return true;
- if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
- return isUseOfShuffle(U);
}
return false;
}
SplatBitSize < VT.getSizeInBits()) {
// Avoid replacing with broadcast when it's a use of a shuffle
// instruction to preserve the present custom lowering of shuffles.
- if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+ if (isFoldableUseOfShuffle(BVOp))
return SDValue();
// replace BUILD_VECTOR with broadcast of the repeated constants.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
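Note on the regenerated checks below (an illustrative aside, not part of the patch itself): since VPERMV/VPERMV3 nodes can never fold their index operand, isFoldableUseOfShuffle() now returns false when the BUILD_VECTOR is only used as such an index, so splat-constant index vectors may be lowered as broadcasts (the vbroadcasti128/vbroadcastf128/vpbroadcastq lines in the diffs below) instead of full-width constant-pool loads. A minimal IR sketch of the affected pattern, assuming an AVX512 target; the function name is hypothetical and mirrors the existing [2,7,2,7] test case:

  ; A cross-lane shuffle whose permute index constant [2,7,2,7] is a 128-bit
  ; splat; with this change the index can be loaded with a 128-bit broadcast.
  define <4 x i64> @splat_index_example(<8 x i64> %v) {
    %r = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> <i32 2, i32 7, i32 2, i32 7>
    ret <4 x i64> %r
  }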
define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2w %ymm4, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm0
+; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
+; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w (%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm3
+; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w (%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [7,6,4,6,12,4,27,1]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm3
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm3
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
-; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <4,0,3,2,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,3,2]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <4,0,3,2,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,0,3,2]
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <4,0,3,2,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,7,3,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,7,3]
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <3,0,7,3,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <5,3,2,5,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [5,3,2,5]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,3,2,5,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,3,2,5]
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,3,2,5,u,u,u,u>
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [5,1,3,4]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,13,0]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [3,0,0,13]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u>
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
-; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [3,0,0,13]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2d %ymm4, %ymm3, %ymm0
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm3
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <13,0,0,6,u,u,u,u>
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm3
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <2,15,6,9,u,u,u,u>
-; CHECK-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,15,6,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <4,1,u,2,u,u,u,u>
-; CHECK-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4,1,0,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpermps %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,2,7]
+; CHECK-NEXT: # ymm4 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,7,2,7]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z}
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <1,3,5,0,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [1,3,5,0]
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <1,3,5,0,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,2,7,0,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,2,7,0]
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,2,7,0,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <3,3,5,2,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,5,2]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,5,2,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [3,3,5,2]
; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,5,2,u,u,u,u>
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <12,0,1,2,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [12,0,1,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
-; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
+; CHECK-NEXT: vmovaps (%rdi), %ymm3
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm3
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = <3,3,15,9,u,u,u,u>
-; CHECK-NEXT: vpermi2ps 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm2
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,7,3,7]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,7,3,7]
+; CHECK-NEXT: # ymm4 = mem[0,1,0,1]
; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [3,7,3,7]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z}
;
; ALL32-LABEL: f4xi64_i128:
; ALL32: # %bb.0:
-; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm2
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: retl
;
; AVX2-LABEL: f8xi64_i128:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
;
; AVX512-LABEL: f8xi64_i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX512-LABEL: f8xi64_i256:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
+; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: test_reduce_v2i64:
-; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovd %xmm0, %eax
-; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_reduce_v2i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v2i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
; X86-AVX1-LABEL: test_reduce_v8i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm3 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm5
; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm2
; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
;
; X86-AVX2-LABEL: test_reduce_v8i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: test_reduce_v2i64:
-; X86-AVX: ## %bb.0:
-; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovd %xmm0, %eax
-; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: test_reduce_v2i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v2i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: retl
;
; X64-SSE2-LABEL: test_reduce_v2i64:
; X64-SSE2: ## %bb.0:
; X86-AVX1-LABEL: test_reduce_v4i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm2 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; X86-AVX2-LABEL: test_reduce_v4i64:
; X86-AVX2: ## %bb.0:
; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
; X86-AVX1-LABEL: test_reduce_v8i64:
; X86-AVX1: ## %bb.0:
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; X86-AVX1-NEXT: ## xmm3 = mem[0,0]
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5
; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm4
; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
;
; X86-AVX2-LABEL: test_reduce_v8i64:
; X86-AVX2: ## %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
;
; X32-AVX-LABEL: clamp_sitofp_2i64_2f64:
; X32-AVX: # %bb.0:
-; X32-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967041,4294967295,4294967041,4294967295]
+; X32-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [NaN,NaN]
+; X32-AVX-NEXT: # xmm1 = mem[0,0]
; X32-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; X32-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0]
+; X32-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321]
+; X32-AVX-NEXT: # xmm1 = mem[0,0]
; X32-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X32-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X32-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; HSW-LABEL: compressstore_v16f32_const:
; HSW: # %bb.0: # %cond.store
; HSW-NEXT: vmovups %ymm0, (%rdi)
-; HSW-NEXT: vmovaps {{.*#+}} ymm0 = <0,1,2,4,u,u,u,u>
+; HSW-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,4]
; HSW-NEXT: vpermps %ymm1, %ymm0, %ymm0
; HSW-NEXT: vmovups %xmm0, 32(%rdi)
; HSW-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
+; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
+; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
+; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
;
; X86-AVX2-LABEL: trunc_ashr_v4i64_demandedelts:
; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
+; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0]
+; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2
; CHECK-NEXT: vpmovd2m %zmm0, %k1
; CHECK-NEXT: vmovapd 0, %zmm0
; CHECK-NEXT: vmovapd 64, %zmm1
-; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm2 = [3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313,3.3951932655444357E-313]
; CHECK-NEXT: kshiftrw $8, %k1, %k2
; CHECK-NEXT: vorpd %zmm2, %zmm1, %zmm1 {%k2}
; CHECK-NEXT: vorpd %zmm2, %zmm0, %zmm0 {%k1}
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k0
; AVX512VLBW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VLBW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512VLBW-NEXT: vpmovw2m %zmm0, %k0
; AVX512VLBW-NEXT: vpmovm2b %k0, %ymm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [1,5,9,13,17,21,25,29]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [2,6,10,14,18,22,26,30]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [3,7,11,15,19,23,27,31]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,32,40,48,56,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
+; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
+; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
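
The vpbroadcastq immediates in the hunks above are not magic numbers: they are the removed index vectors packed little-endian into a single 64-bit lane, which the broadcast then splats across the register (the same packing explains the vbroadcastsd value 17179869184 further down, i.e. {0,4} as two i32 elements). A minimal standalone C++ sketch, with hypothetical helper names that are not part of this patch, reproducing both constants:

// Hypothetical verification snippet (not part of the patch): pack the removed
// shuffle indices little-endian into one 64-bit lane and compare against the
// vpbroadcastq immediates shown in the checks above.
#include <cstdint>
#include <cstdio>

// Pack N elements of EltBits bits each, element 0 in the lowest bits.
static uint64_t packLane(const uint64_t *Elts, unsigned N, unsigned EltBits) {
  uint64_t Lane = 0;
  for (unsigned I = 0; I != N; ++I)
    Lane |= Elts[I] << (I * EltBits);
  return Lane;
}

int main() {
  const uint64_t WordIdx[4] = {0, 8, 16, 24};                 // PR34175 word indices
  const uint64_t ByteIdx[8] = {0, 8, 16, 24, 32, 40, 48, 56}; // v64i8->v8i8 byte indices
  printf("%llu\n", (unsigned long long)packLane(WordIdx, 4, 16)); // 6755468161056768
  printf("%llu\n", (unsigned long long)packLane(ByteIdx, 8, 8));  // 4048780183313844224
  return 0;
}
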
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovwb %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
+; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
;
; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2b %ymm2, %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, %xmm0
+; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
+; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512: # %bb.0: # %entry
-; X32-AVX512-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
+; X32-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,0,3,0,4,0]
+; X32-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovdqu %ymm0, ga4
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpsllw $2, %ymm2, %ymm4
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm4
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm4
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm4, %ymm4
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm4
; AVX512VL-NEXT: vpsllw $2, %ymm4, %ymm7
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
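
In the rotate/funnel-shift hunks above and below, the 256-bit and 512-bit blend masks are no longer materialized as full-width constant-pool loads: because their contents repeat a single 128-bit pattern, they become vbroadcasti128 / vbroadcasti32x4 loads of a 16-byte pool entry, which is what the `# ymm = mem[0,1,0,1]` and `# zmm = mem[0,1,2,3,...]` reassembly comments record. A small sketch of the repeat test, assuming a plain array representation rather than the in-tree APInt-based constant handling:

// Hypothetical sketch (not the in-tree code): a wide constant can be loaded
// with a 128-bit broadcast whenever every 128-bit lane repeats the first one.
#include <cstdint>
#include <vector>

static bool repeatsFirst128BitLane(const std::vector<uint16_t> &Elts) {
  const size_t LaneElts = 8; // 128 bits / 16-bit elements
  if (Elts.size() % LaneElts != 0)
    return false;
  for (size_t I = LaneElts; I != Elts.size(); ++I)
    if (Elts[I] != Elts[I % LaneElts])
      return false;
  return true;
}

int main() {
  // The ymm blend mask from the AVX2 hunk above.
  std::vector<uint16_t> Mask = {8192, 24640, 41088, 57536, 57344, 41152, 24704, 8256,
                                8192, 24640, 41088, 57536, 57344, 41152, 24704, 8256};
  return repeatsFirst128BitLane(Mask) ? 0 : 1; // exits 0: eligible for vbroadcasti128
}
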
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512F-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VL-NEXT: # ymm6 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm4
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm2, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
;
; AVX512VBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VLBW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
;
; AVX512VLVBMI2-LABEL: constant_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VLVBMI2-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536,57600,41152,24704,8256,8448,24640,41088,57536]
+; AVX512VLVBMI2-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm2, %k1
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
;
; AVX512VLBW-LABEL: constant_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256]
+; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512VL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $2, %ymm2, %ymm5
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_rotate_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
;
; AVX512VLBW-LABEL: constant_rotate_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLBW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLBW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; X32-AVX1-LABEL: var_shift_v4i64:
; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X32-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
+; X32-AVX1-NEXT: # xmm3 = mem[0,0]
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6
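
The vmovddup hunk above swaps an integer load of [0,2147483648,0,2147483648] for a broadcast of the double constant -0.0E+0. The two are bit-identical: each 64-bit lane is 0x8000000000000000, just the sign bit, which the asm printer renders as negative zero. A tiny check under that assumption (little-endian host, matching the x86 targets in these tests; not part of the patch):

// Hypothetical check: the removed i32 pair {0, 2147483648} is
// 0x8000000000000000 little-endian, which reinterpreted as an IEEE-754
// double is negative zero.
#include <cstdint>
#include <cstring>
#include <cmath>

int main() {
  const uint32_t Pair[2] = {0u, 2147483648u}; // low word, high word
  uint64_t Bits;
  std::memcpy(&Bits, Pair, sizeof(Bits));
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  // Exits 0 when the constant is -0.0 (compares equal to zero, sign bit set).
  return (D == 0.0 && std::signbit(D)) ? 0 : 1;
}
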
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $2, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm1
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQVL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQVL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpsllw $2, %ymm0, %ymm1
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
-; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; X32-AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $2, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VL-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,4,4,5,6,4]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,5,6,4,4,5,6,4]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31]
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15]
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX512VLVBMI-FAST-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI-FAST: # %bb.0:
-; AVX512VLVBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512VLVBMI-FAST-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-FAST-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
;
; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
+; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0]
+; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]
; AVX2-FAST-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8f32_32103210:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_32103210:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
;
; AVX2-FAST-LABEL: shuffle_v8f32_76547654:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
;
; AVX2-FAST-LABEL: shuffle_v8i32_32103210:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_32103210:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
-; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
;
; AVX2-FAST-LABEL: shuffle_v8i32_76547654:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654:
; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
-; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
; AVX2-LABEL: lowhalf_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u>
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: lowhalf_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u>
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-LABEL: lowhalf_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,6,u,u,u,u>
+; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6]
+; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: lowhalf_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = <2,14,3,14,u,u,u,u>
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14]
+; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,0,4,u,u,u,u>
+; ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184]
; ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; ALL-NEXT: vzeroupper
;
; SKX-LABEL: pr32967:
; SKX: ## %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29]
; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
-; SKX-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
-; SKX-NEXT: vmovdqa %xmm1, %xmm0
+; SKX-NEXT: vpermt2w %ymm2, %ymm1, %ymm0
+; SKX-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
;
; AVX512F-LABEL: shuffle_v8f64_08080808:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08080808:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
+; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
+; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
;
; AVX512F-LABEL: shuffle_v8i64_08080808:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
+; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08080808:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
+; AVX512F-32-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
+; AVX512F-32-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
; X86-AVX512: # %bb.0: # %entry
; X86-AVX512-NEXT: vmovups 0, %zmm0
; X86-AVX512-NEXT: vmovups 64, %ymm1
-; X86-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X86-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vmovups 0, %zmm0
; X64-AVX512-NEXT: vmovups 64, %ymm1
-; X64-AVX512-NEXT: vmovaps {{.*#+}} zmm2 = <2,5,8,11,14,17,20,23,u,u,u,u,u,u,u,u>
+; X64-AVX512-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) {
; CHECK-LABEL: combine_vpermi2var_32i8_as_vpermb:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) {
; CHECK-LABEL: combine_vpermi2var_64i8_as_vpermb:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) {
; CHECK-LABEL: combine_vpermi2var_32i8_as_vperm2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
; CHECK-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
; AVX2-FAST-LABEL: combine_unneeded_subvector1:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
+; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT: vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT: vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT: vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT: vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT: kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT: vpblendmb %ymm2, %ymm3, %ymm0 {%k1}