From: Gadi Haber
Date: Mon, 27 Mar 2017 12:13:37 +0000 (+0000)
Subject: [X86][AVX2] bugzilla bug 21281 Performance regression in vector interleave in AVX2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cd2a3f9e73d722f979390cbaf6c6c116caca82da;p=llvm

[X86][AVX2] bugzilla bug 21281 Performance regression in vector interleave in AVX2

This is a patch for the ongoing bugzilla bug 21281 on the X86 code generated
for a matrix transpose8x8 subroutine, which requires vector interleaving. The
code currently generated for AVX2 is suboptimal: it takes 60 instructions,
whereas the code generated for AVX1 takes only 40. The patch fixes the AVX2
case, where the vector unpack instructions need fewer operations than the
vector blend operations available in AVX2, so using the unpack instructions
is more efficient.

Reviewers: zvi, delena, igorb, craig.topper, guyblank, eladcohen, m_zuckerman, aymanmus, RKSimon

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@298840 91177308-0d34-0410-b5e6-96231b3b80d8
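For reference, the kind of kernel the bug report describes can be sketched as
a plain scalar 8x8 transpose (a hypothetical reconstruction, not the exact
transpose8x8 source from the bugzilla entry): interleaving eight rows of
16-bit elements element-by-element is exactly a transpose, and vectorizing
the output rows yields the <0,8,1,9,...> interleaving shufflevector masks
exercised by the interleave8x8 test updated below.

  #include <cstdint>

  // Hypothetical scalar reference: row i of `out` gathers element i of every
  // input row, i.e. the columns of `in` become the interleaved rows of `out`.
  void transpose8x8(const uint16_t in[8][8], uint16_t out[8][8]) {
    for (int i = 0; i < 8; ++i)
      for (int j = 0; j < 8; ++j)
        out[i][j] = in[j][i];
  }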
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1cb81d338bd..c77dd9517bb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -8207,6 +8207,23 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
   return true;
 }
 
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return false;
+
+  SmallVector<int, 8> Unpcklwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+                          /* Unary = */ false);
+  SmallVector<int, 8> Unpckhwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+                          /* Unary = */ false);
+  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+                         isTargetShuffleEquivalent(Mask, Unpckhwd));
+  return IsUnpackwdMask;
+}
+
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
 /// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -12740,6 +12757,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        V1, V2, DAG, Subtarget))
     return V;
 
+  // For non-AVX512, if the mask is of 16-bit elements in lane, then try to
+  // split, since after the split we get more efficient code using vpunpcklwd
+  // and vpunpckhwd instructions than with vblend.
+  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+                                                     Mask, DAG))
+      return V;
+
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
@@ -12771,6 +12796,14 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
     return ZExt;
 
+  // For non-AVX512, if the mask is of 16-bit elements in lane, then try to
+  // split, since after the split we get more efficient code than with vblend
+  // by using vpunpcklwd and vpunpckhwd instructions.
+  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !Subtarget.hasAVX512())
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2,
+                                                     Mask, DAG))
+      return V;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
     return Blend;
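To make the new predicate concrete, here is a standalone model of the two
v8i16 masks it compares against (an illustrative sketch, not the actual LLVM
createUnpackShuffleMask/isTargetShuffleEquivalent API):

  #include <array>
  #include <cstdio>

  // Model of the binary unpack masks for 8 elements in one 128-bit lane:
  // vpunpcklwd interleaves elements 0..3 of both operands, vpunpckhwd 4..7.
  // Elements of the first operand are numbered 0..7, of the second 8..15.
  static std::array<int, 8> unpackWdMask(bool Lo) {
    std::array<int, 8> Mask;
    int Base = Lo ? 0 : 4;             // low half for unpcklwd, high for unpckhwd
    for (int i = 0; i != 4; ++i) {
      Mask[2 * i] = Base + i;          // element taken from the first source
      Mask[2 * i + 1] = Base + i + 8;  // matching element from the second source
    }
    return Mask;
  }

  int main() {
    // Prints <0,8,1,9,2,10,3,11> and <4,12,5,13,6,14,7,15>: a v8i32/v8f32
    // shuffle mask equivalent to either of these is really a word unpack, so
    // splitting into two 128-bit vpunpcklwd/vpunpckhwd ops beats vblend.
    for (bool Lo : {true, false}) {
      for (int M : unpackWdMask(Lo))
        std::printf("%d ", M);
      std::printf("%s\n", Lo ? "(vpunpcklwd)" : "(vpunpckhwd)");
    }
    return 0;
  }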
diff --git a/test/CodeGen/X86/vector-interleave.ll b/test/CodeGen/X86/vector-interleave.ll
index 4f9dbb03fb1..1265ea10897 100644
--- a/test/CodeGen/X86/vector-interleave.ll
+++ b/test/CodeGen/X86/vector-interleave.ll
@@ -93,44 +93,32 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,4,u,5,u,6,u,7>
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4],ymm1[5],ymm8[6],ymm1[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm5
-; AVX2-NEXT:    vpermd %ymm5, %ymm9, %ymm6
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7]
-; AVX2-NEXT:    vpermd %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; AVX2-NEXT:    retq
   %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
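The shape of the new sequence above, written as intrinsics (a hand-written
sketch of the first two unpack stages for four of the eight inputs; not
compiler output):

  #include <immintrin.h>

  // Build (a_i, b_i, c_i, d_i) quads for i = 0..3 using only unpacks, as the
  // new AVX2 lowering does: word unpacks pair up a/b and c/d, then dword
  // unpacks merge the pairs into quads. No vpermd/vpblendd needed.
  static void interleave4_lo(__m128i a, __m128i b, __m128i c, __m128i d,
                             __m128i &quads01, __m128i &quads23) {
    __m128i ab = _mm_unpacklo_epi16(a, b); // a0,b0,a1,b1,a2,b2,a3,b3
    __m128i cd = _mm_unpacklo_epi16(c, d); // c0,d0,c1,d1,c2,d2,c3,d3
    quads01 = _mm_unpacklo_epi32(ab, cd);  // a0,b0,c0,d0,a1,b1,c1,d1
    quads23 = _mm_unpackhi_epi32(ab, cd);  // a2,b2,c2,d2,a3,b3,c3,d3
  }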
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 2f5b011a183..0928a3c66b3 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -266,21 +266,12 @@ define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_08192a3b:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_08192a3b:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v8f32_08192a3b:
+; AVX1OR2:       # BB#0:
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8f32_08192a3b:
 ; AVX512VL:       # BB#0:
@@ -1221,10 +1212,9 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_08192a3b:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_08192a3b:
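For comparison, the new three-instruction AVX2 pattern for
shuffle_v8i32_08192a3b, rendered as intrinsics (a sketch mirroring the CHECK
lines above, not part of the patch):

  #include <immintrin.h>

  // <0,8,1,9,2,10,3,11>: interleave the low four dwords of a and b.
  // Two 128-bit unpacks plus one insert replace the old
  // vpermd + vpmovzxdq + vpblendd sequence.
  static __m256i shuffle_v8i32_08192a3b(__m256i a, __m256i b) {
    __m128i alo = _mm256_castsi256_si128(a);
    __m128i blo = _mm256_castsi256_si128(b);
    __m128i lo = _mm_unpacklo_epi32(alo, blo); // a0,b0,a1,b1  (vpunpckldq)
    __m128i hi = _mm_unpackhi_epi32(alo, blo); // a2,b2,a3,b3  (vpunpckhdq)
    return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
  }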