From: Simon Pilgrim
Date: Sun, 3 Feb 2019 16:51:33 +0000 (+0000)
Subject: [X86][AVX] Support shuffle combining for VBROADCAST with smaller vector sources
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=39a2370f4eb11e377373be8b87bd41a34761b24a;p=llvm

[X86][AVX] Support shuffle combining for VBROADCAST with smaller vector sources

getTargetShuffleMask can only do this safely if we're extracting the lowest
subvector from a vector of the same result type.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352999 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index dbc4947aaf8..50502e2911e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6773,6 +6773,26 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     }
     return true;
   }
+  case X86ISD::VBROADCAST: {
+    SDValue Src = N.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    if (!SrcVT.isVector())
+      return false;
+
+    if (NumSizeInBits != SrcVT.getSizeInBits()) {
+      assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
+             "Illegal broadcast type");
+      SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
+                               NumSizeInBits / SrcVT.getScalarSizeInBits());
+      Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
+                        DAG.getUNDEF(SrcVT), Src,
+                        DAG.getIntPtrConstant(0, SDLoc(N)));
+    }
+
+    Ops.push_back(Src);
+    Mask.append(NumElts, 0);
+    return true;
+  }
   case ISD::ZERO_EXTEND_VECTOR_INREG:
   case ISD::ZERO_EXTEND: {
     SDValue Src = N.getOperand(0);
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 4d285006a73..174a9e5bd32 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2174,8 +2174,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT:    vpbroadcastq %xmm3, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,3,7]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
 ; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
@@ -2189,9 +2188,8 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT:    vpbroadcastq %xmm2, %ymm3
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,7]
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
@@ -3806,9 +3804,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double>
 define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; CHECK-NEXT:    vbroadcastsd %xmm1, %ymm2
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,1,7]
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [0,2,1,4]
 ; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -3819,8 +3816,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT:    vbroadcastsd %xmm3, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,2,1,7]
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm4 = [0,2,1,4]
 ; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
 ; CHECK-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
@@ -3835,9 +3831,8 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-NEXT:    vbroadcastsd %xmm2, %ymm3
-; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,1,7]
+; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = [0,2,1,4]
 ; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
 ; CHECK-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index ee0af90d093..2092b3bf453 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -190,9 +190,10 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
 define <8 x float> @expand5(<4 x float> %a ) {
 ; SKX64-LABEL: expand5:
 ; SKX64:       # %bb.0:
-; SKX64-NEXT:    vbroadcastss %xmm0, %ymm0
+; SKX64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; SKX64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; SKX64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX64-NEXT:    vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0]
+; SKX64-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
 ; SKX64-NEXT:    retq
 ;
 ; KNL64-LABEL: expand5:
@@ -204,9 +205,10 @@ define <8 x float> @expand5(<4 x float> %a ) {
 ;
 ; SKX32-LABEL: expand5:
 ; SKX32:       # %bb.0:
-; SKX32-NEXT:    vbroadcastss %xmm0, %ymm0
+; SKX32-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 ; SKX32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; SKX32-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX32-NEXT:    vmovaps {{.*#+}} ymm2 = [8,0,10,0,12,0,14,0]
+; SKX32-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0
 ; SKX32-NEXT:    retl
 ;
 ; KNL32-LABEL: expand5:
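Illustration (not part of the patch): once the VBROADCAST is modelled as a faux shuffle whose only operand is the widened source vector and whose mask is all zeros, an outer permute index that selected any lane of the broadcast can instead select lane 0 of the source, which is why the vpermi2q/vpermi2pd index vectors in the tests change from [2,0,3,7] to [2,0,3,4] and the explicit broadcast instructions disappear. A minimal standalone C++ sketch of that mask composition (hypothetical code, not taken from LLVM):

#include <cstdio>
#include <vector>

int main() {
  // Outer vpermi2q-style mask over two 4-lane operands: indices 0-3 pick lanes
  // of the first operand, indices 4-7 pick lanes of the second (the broadcast).
  std::vector<int> Outer = {2, 0, 3, 7};
  // Faux-shuffle mask for the broadcast operand: every lane reads element 0.
  std::vector<int> Broadcast = {0, 0, 0, 0};

  // Compose the two masks: any index into the broadcast resolves to lane 0 of
  // its (widened) source, so the broadcast itself is no longer needed.
  std::vector<int> Combined;
  for (int Idx : Outer) {
    if (Idx < 4)
      Combined.push_back(Idx);
    else
      Combined.push_back(4 + Broadcast[Idx - 4]);
  }

  for (int Idx : Combined)
    std::printf("%d ", Idx); // prints: 2 0 3 4
  std::printf("\n");
  return 0;
}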