isa<ConstantSDNode>(V2.getOperand(1))) {
SDValue Src1 = V1.getOperand(0);
SDValue Src2 = V2.getOperand(0);
- if (Src1 == Src2) {
+ if (Src1.getValueType() == Src2.getValueType()) {
unsigned Offset1 = V1.getConstantOperandVal(1);
unsigned Offset2 = V2.getConstantOperandVal(1);
assert(((Offset1 % VT1.getVectorNumElements()) == 0 ||
(Offset2 % VT2.getVectorNumElements()) == 0 ||
(Src1.getValueSizeInBits() % RootSizeInBits) == 0) &&
"Unexpected subvector extraction");
+ unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
+
// Convert extraction indices to mask size.
Offset1 /= VT1.getVectorNumElements();
Offset2 /= VT2.getVectorNumElements();
Offset1 *= NumMaskElts;
Offset2 *= NumMaskElts;
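+ // Collect the source vectors as shuffle inputs; if the sources differ, the
+ // second source's mask indices are rebased past the widened first input.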
+ SmallVector<SDValue, 2> NewInputs;
+ NewInputs.push_back(Src1);
+ if (Src1 != Src2) {
+ NewInputs.push_back(Src2);
+ Offset2 += Scale * NumMaskElts;
+ }
+
// Create new mask for larger type.
SmallVector<int, 64> NewMask(Mask);
for (int &M : NewMask) {
  if (M < 0)
    continue;
  if (M < (int)NumMaskElts)
    M += Offset1;
  else
    M = (M - NumMaskElts) + Offset2;
}
- unsigned Scale = Src1.getValueSizeInBits() / RootSizeInBits;
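+ // Pad the mask with undefs so its length matches the wider source type.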
NewMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
- SDValue NewInputs[] = {Src1};
if (SDValue Res = combineX86ShuffleChain(
NewInputs, Src1, NewMask, Depth, HasVariableMask,
AllowVariableMask, DAG, Subtarget)) {
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,3]
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vpermd %ymm2, %ymm1, %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; CHECK-NEXT: vpermt2d %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm4
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm4
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,2,6]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [0,14,2,14]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermt2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: vmovaps %xmm1, %xmm8
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm5
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3]
-; CHECK-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3]
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm4, %xmm6, %xmm12
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm12
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1]
+; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm9, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16]
-; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32,15,15,15,15,15,15,15,15,32,32,32,32,32,32,32,32]
+; AVX512VLVBMI-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VLVBMI-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i8> %shuffle