    return Cst;

  // We can only combine unary and binary shuffle mask cases.
-  if (Ops.size() > 2)
-    return SDValue();
-
-  // Minor canonicalization of the accumulated shuffle mask to make it easier
-  // to match below. All this does is detect masks with sequential pairs of
-  // elements, and shrink them to the half-width mask. It does this in a loop
-  // so it will reduce the size of the mask to the minimal width mask which
-  // performs an equivalent shuffle.
-  SmallVector<int, 64> WidenedMask;
-  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
-    Mask = std::move(WidenedMask);
-  }
+  if (Ops.size() <= 2) {
+    // Minor canonicalization of the accumulated shuffle mask to make it easier
+    // to match below. All this does is detect masks with sequential pairs of
+    // elements, and shrink them to the half-width mask. It does this in a loop
+    // so it will reduce the size of the mask to the minimal width mask which
+    // performs an equivalent shuffle.
+    SmallVector<int, 64> WidenedMask;
+    while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+      Mask = std::move(WidenedMask);
+    }
+
+    // Canonicalization of binary shuffle masks to improve pattern matching by
+    // commuting the inputs.
+    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(Ops[0], Ops[1]);
+    }
-  // Canonicalization of binary shuffle masks to improve pattern matching by
-  // commuting the inputs.
-  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
-    ShuffleVectorSDNode::commuteMask(Mask);
-    std::swap(Ops[0], Ops[1]);
+
+    // Finally, try to combine into a single shuffle instruction.
+    return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+                                  AllowVariableMask, DAG, Subtarget);
  }
-  // Finally, try to combine into a single shuffle instruction.
-  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
-                                AllowVariableMask, DAG, Subtarget);
+
+  // If that failed and any input is extracted then try to combine as a
+  // shuffle with the larger type.
+  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+                                           HasVariableMask, AllowVariableMask,
+                                           DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
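The two canonicalizations kept in the Ops.size() <= 2 path above are self-contained mask transforms. Below is a minimal standalone sketch of both steps; widenShuffleMask and commuteShuffleMask are illustrative stand-ins for LLVM's canWidenShuffleElements and ShuffleVectorSDNode::commuteMask, not the real implementations (canWidenShuffleElements also widens pairs that mix undef and defined lanes, which this sketch conservatively rejects). As in LLVM shuffle masks, -1 marks an undef lane and indices >= Mask.size() address the second operand.

#include <cstdio>
#include <utility>
#include <vector>

// Widening: a mask whose lanes form sequential even/odd pairs, e.g. {0,1,6,7},
// performs the same shuffle as the half-width mask {0,3} on elements of twice
// the width.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  if (Mask.size() % 2 != 0)
    return false;
  Widened.clear();
  for (size_t I = 0; I != Mask.size(); I += 2) {
    int Lo = Mask[I], Hi = Mask[I + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1); // Fully undef pair -> undef wide lane.
    else if (Lo >= 0 && Lo % 2 == 0 && Hi == Lo + 1)
      Widened.push_back(Lo / 2); // Sequential pair -> one wide index.
    else
      return false; // Not expressible at the wider element width.
  }
  return true;
}

// Commuting: swapping the two shuffle operands moves every in-range index to
// the other half of the 2*N index space.
static void commuteShuffleMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts;
}

int main() {
  std::vector<int> Mask = {0, 1, 6, 7, 4, 5, 2, 3}, Widened;
  // Shrinks in a loop, like the code above: {0,1,6,7,4,5,2,3} -> {0,3,2,1};
  // a second iteration cannot widen {0,3,2,1} further, so the loop stops.
  while (Mask.size() > 1 && widenShuffleMask(Mask, Widened))
    Mask = std::move(Widened);
  commuteShuffleMask(Mask); // {0,3,2,1} -> {4,7,6,5}
  for (int M : Mask)
    std::printf("%d ", M); // prints: 4 7 6 5
  std::printf("\n");
}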
; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: vmovaps %xmm1, %xmm8
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
-; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm12
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm9
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
+; CHECK-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm15
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
+; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
+; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
+; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT: vaddps %xmm9, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
-; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm3[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm8
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm13[0]
+; CHECK-NEXT: vaddps %xmm15, %xmm2, %xmm2
+; CHECK-NEXT: vmovaps %xmm14, %xmm1
+; CHECK-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vaddps %xmm10, %xmm14, %xmm10
+; CHECK-NEXT: vaddps %xmm14, %xmm14, %xmm3
+; CHECK-NEXT: vaddps %xmm12, %xmm15, %xmm0
+; CHECK-NEXT: vaddps %xmm8, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm14, %xmm0
; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %xmm9, (%rsp)
-; CHECK-NEXT: vmovaps %xmm8, %xmm3
+; CHECK-NEXT: vmovaps %xmm10, (%rsp)
+; CHECK-NEXT: vmovaps %xmm9, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
;
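To decode the CHECK lines above: vpermi2ps is a two-source permute in which each lane of the index operand (the destination register on input) selects one element of the 32-element table formed by concatenating the two sources. With 16 floats per zmm register, index values 0-15 pick from the first source and 16-31 from the second, so in `vpermi2ps %zmm3, %zmm2, %zmm15` the index vector [4,22,1,17] reads zmm2[4], zmm3[6], zmm2[1], zmm3[1]. The vbroadcastsd form packs an index pair into one 64-bit constant: 85899345925 = 20*2^32 + 5, i.e. the repeating pair {5,20}. A scalar model of the lane selection, assuming standard AVX-512 semantics (a sketch, not Intel's pseudocode):

#include <array>
#include <cstdio>

using V16F = std::array<float, 16>;
using V16I = std::array<int, 16>;

// Each result lane is table[Idx[i] & 31], where the table is {Src1, Src2}.
V16F permi2ps(const V16I &Idx, const V16F &Src1, const V16F &Src2) {
  V16F Result{};
  for (int I = 0; I != 16; ++I) {
    int Sel = Idx[I] & 31; // Only log2(32) = 5 index bits are significant.
    Result[I] = Sel < 16 ? Src1[Sel] : Src2[Sel - 16];
  }
  return Result;
}

int main() {
  V16F A{}, B{};
  for (int I = 0; I != 16; ++I) {
    A[I] = (float)I;       // stands in for zmm2 lanes 0..15
    B[I] = 100.0f + I;     // stands in for zmm3 lanes 0..15
  }
  // The index pattern from the vmovaps xmm8 line above, repeated across the
  // register for illustration: [4,28,1,29].
  V16I Idx{4, 28, 1, 29, 4, 28, 1, 29, 4, 28, 1, 29, 4, 28, 1, 29};
  V16F R = permi2ps(Idx, A, B);
  std::printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]); // prints: 4 112 1 113
}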
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
;
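The single-vpermb form above is what the combineX86ShuffleChainWithExtract fallback buys. The old sequence broadcast two 64-bit index constants (2096730264494998785 and 4411615795313452321 are just the byte indices {1,5,9,13,17,21,25,29} and {33,37,41,45,49,53,57,61} packed little-endian), ran two zmm permutes, and blended the halves; the new code lifts the mask through the extracts to the wider v64i8 type, performs one vpermb with the literal index vector, and reads the result from the low xmm lanes. A hypothetical sketch of the mask lifting (liftMaskThroughExtract is an illustrative name, not an LLVM API):

#include <cstdio>
#include <vector>

// Mask indices [0, NumElts) address Op0 and [NumElts, 2*NumElts) address Op1.
// If Op0 and Op1 were extracted at element offsets Off0 and Off1 from sources
// with WideElts elements, produce the equivalent mask over the wide sources.
std::vector<int> liftMaskThroughExtract(const std::vector<int> &Mask,
                                        int NumElts, int WideElts,
                                        int Off0, int Off1) {
  std::vector<int> WideMask;
  for (int M : Mask) {
    if (M < 0)
      WideMask.push_back(-1); // Undef lanes stay undef.
    else if (M < NumElts)
      WideMask.push_back(Off0 + M);
    else
      WideMask.push_back(WideElts + Off1 + (M - NumElts));
  }
  return WideMask;
}

int main() {
  // A 4-element shuffle of two subvectors taken from the same 16-element
  // source at offsets 0 and 8 lifts to one 16-wide mask; because both wide
  // sources are the same vector here, indices >= 16 can be folded back into
  // the first operand (24 -> 8, 25 -> 9), leaving a unary wide shuffle.
  std::vector<int> Wide =
      liftMaskThroughExtract({0, 4, 1, 5}, /*NumElts=*/4, /*WideElts=*/16,
                             /*Off0=*/0, /*Off1=*/8);
  for (int M : Wide)
    std::printf("%d ", M); // prints: 0 24 1 25
  std::printf("\n");
}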
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;