From: Simon Pilgrim
Date: Wed, 3 Jul 2019 15:46:08 +0000 (+0000)
Subject: [X86][AVX] combineX86ShufflesRecursively - peek through extract_subvector
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4393b17a68582ed9208d8ee6a0b2d2a5f0d9982e;p=llvm

[X86][AVX] combineX86ShufflesRecursively - peek through extract_subvector

If we have more than 2 shuffle ops to combine, try to use
combineX86ShuffleChainWithExtract to see if some are from the same
super vector.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365050 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 3e75756cffd..fc264a74975 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32738,29 +32738,34 @@ static SDValue combineX86ShufflesRecursively(
     return Cst;
 
   // We can only combine unary and binary shuffle mask cases.
-  if (Ops.size() > 2)
-    return SDValue();
-
-  // Minor canonicalization of the accumulated shuffle mask to make it easier
-  // to match below. All this does is detect masks with sequential pairs of
-  // elements, and shrink them to the half-width mask. It does this in a loop
-  // so it will reduce the size of the mask to the minimal width mask which
-  // performs an equivalent shuffle.
-  SmallVector<int, 64> WidenedMask;
-  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
-    Mask = std::move(WidenedMask);
-  }
+  if (Ops.size() <= 2) {
+    // Minor canonicalization of the accumulated shuffle mask to make it easier
+    // to match below. All this does is detect masks with sequential pairs of
+    // elements, and shrink them to the half-width mask. It does this in a loop
+    // so it will reduce the size of the mask to the minimal width mask which
+    // performs an equivalent shuffle.
+    SmallVector<int, 64> WidenedMask;
+    while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+      Mask = std::move(WidenedMask);
+    }
+
+    // Canonicalization of binary shuffle masks to improve pattern matching by
+    // commuting the inputs.
+    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(Ops[0], Ops[1]);
+    }
 
-  // Canonicalization of binary shuffle masks to improve pattern matching by
-  // commuting the inputs.
-  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
-    ShuffleVectorSDNode::commuteMask(Mask);
-    std::swap(Ops[0], Ops[1]);
+    // Finally, try to combine into a single shuffle instruction.
+    return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+                                  AllowVariableMask, DAG, Subtarget);
   }
 
-  // Finally, try to combine into a single shuffle instruction.
-  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
-                                AllowVariableMask, DAG, Subtarget);
+  // If that failed and any input is extracted then try to combine as a
+  // shuffle with the larger type.
+  return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+                                           HasVariableMask, AllowVariableMask,
+                                           DAG, Subtarget);
 }
 
 /// Helper entry wrapper to combineX86ShufflesRecursively.
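The idea behind combineX86ShuffleChainWithExtract can be pictured with a short
standalone C++ sketch. This is an illustration only, not the real signature:
ExtractedOp and peekThroughExtracts are hypothetical names, the actual code
works on SDValues, and it also has to look through bitcasts, cope with
mismatched extract widths, and re-extract the narrow result afterwards. The
core step is simple: if every operand of a narrow shuffle was extracted from
the same super vector, the narrow mask can be rewritten as a mask over that
wide source, and the whole chain matched as one wide shuffle.

    #include <vector>

    // Hypothetical stand-in for a shuffle operand: which wide vector it was
    // extracted from, and the element index at which the extract starts.
    struct ExtractedOp {
      int SourceId;  // identity of the wide source vector
      int EltOffset; // first extracted element within that source
    };

    // If both operands of a narrow binary shuffle come from the same wide
    // source, rewrite the narrow mask (0..2N-1 indexes Op0 then Op1, -1 is
    // undef) as a mask over the wide source. Returns false otherwise.
    static bool peekThroughExtracts(const ExtractedOp &Op0,
                                    const ExtractedOp &Op1,
                                    const std::vector<int> &NarrowMask,
                                    std::vector<int> &WideMask) {
      if (Op0.SourceId != Op1.SourceId)
        return false; // different super vectors, nothing to do
      const int NumElts = static_cast<int>(NarrowMask.size());
      WideMask.clear();
      for (int M : NarrowMask) {
        if (M < 0)
          WideMask.push_back(-1);                            // undef stays undef
        else if (M < NumElts)
          WideMask.push_back(Op0.EltOffset + M);             // element of Op0
        else
          WideMask.push_back(Op1.EltOffset + (M - NumElts)); // element of Op1
      }
      return true;
    }

The vector-shuffle-512-v16.ll diff below shows this shape directly: the v4i32
result previously assembled from vextractf128 / vunpcklps / vpermps / vblendps
is recognized as elements 0, 4, 8 and 12 of the original v16i32 and collapses
to a single vpermps of the full zmm register.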
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 2dce179f367..6ce9da9ca57 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -10,53 +10,52 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: subq $72, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: vmovaps %xmm1, %xmm8
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
-; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm12
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm9
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
+; CHECK-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm15
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
+; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
+; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
+; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
 ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
 ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT: vaddps %xmm9, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
-; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm3[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm8
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm13[0]
+; CHECK-NEXT: vaddps %xmm15, %xmm2, %xmm2
+; CHECK-NEXT: vmovaps %xmm14, %xmm1
+; CHECK-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vaddps %xmm10, %xmm14, %xmm10
+; CHECK-NEXT: vaddps %xmm14, %xmm14, %xmm3
+; CHECK-NEXT: vaddps %xmm12, %xmm15, %xmm0
+; CHECK-NEXT: vaddps %xmm8, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm14, %xmm0
 ; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %xmm9, (%rsp)
-; CHECK-NEXT: vmovaps %xmm8, %xmm3
+; CHECK-NEXT: vmovaps %xmm10, (%rsp)
+; CHECK-NEXT: vmovaps %xmm9, %xmm3
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq foo
 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
index d6648da025e..f9eae50039c 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
@@ -716,11 +716,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
 ;
@@ -801,11 +799,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index a98dbfd317b..8e0c1d79672 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -707,11 +707,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
 ;
@@ -792,11 +790,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 ;
 ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512VBMI-NEXT: vzeroupper
 ; AVX512VBMI-NEXT: retq
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index eb096ff1de7..b74a0b1c0fe 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -322,12 +322,9 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
 ; ALL-LABEL: test_v16i32_0_4_8_12:
 ; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184]
-; ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32>