define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
-; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
-; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,12,9,4,14,15,12,14]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
-; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,14,10,7,1,6,9]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
-; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [22,27,7,10,13,21,5,14]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2w %ymm0, %ymm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [22,27,7,10,13,21,5,14]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,21,27,10,8,19,14,5]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [15,13,18,16,9,11,26,8]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [17,0,23,10,1,8,7,30]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [16,17,5,1,14,14,13,17]
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [16,17,5,1,14,14,13,17]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [7,6,4,6,12,4,27,1]
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
-; CHECK-NEXT: vpermt2w (%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,4,6,12,4,27,1]
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [6,18,0,4,10,25,22,10]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [6,18,0,4,10,25,22,10]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [19,1,5,31,9,12,17,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpermt2w 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [19,1,5,31,9,12,17,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2w 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %vp
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,3,2]
-; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,7,3]
-; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,3,2,5]
-; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
-; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,12,4,6,4,12]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [5,1,3,4]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,13,0]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [3,0,0,13]
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2d %ymm3, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [13,0,0,6]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [13,0,0,6]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [15,5,3,2,15,5,7,6]
-; CHECK-NEXT: vpermi2d (%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa 32(%rdi), %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,5,3,2,15,5,7,6]
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermi2d (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,15,6,9]
-; CHECK-NEXT: vmovdqa (%rdi), %ymm2
-; CHECK-NEXT: vpermt2d 32(%rdi), %ymm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [2,15,6,9]
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2d 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %vp
define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,0]
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,7,0]
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,5,2]
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [12,0,1,2]
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vpermt2ps %ymm0, %ymm2, %ymm3
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [12,0,1,2]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,10,6,15,4,14,6,15]
-; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,10,6,15,4,14,6,15]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 32(%rdi), %ymm1
-; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,14,4,14,4,14,6,7]
-; CHECK-NEXT: vpermi2ps (%rdi), %ymm1, %ymm2
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps 32(%rdi), %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,14,4,14,4,14,6,7]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermi2ps (%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [3,3,15,9]
-; CHECK-NEXT: vmovaps (%rdi), %ymm2
-; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm1, %ymm2
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,3,15,9]
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermt2ps 32(%rdi), %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, <16 x float>* %vp
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [1,4,1,4,1,4,1,4]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm1
+; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [1,4]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp