From e610c324e1a56843abdff27a1bf30d188255a23b Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Sun, 14 Sep 2014 23:43:33 +0000
Subject: [PATCH] [x86] Teach the new vector shuffle lowering to use BLENDPS
 and BLENDPD.

These are super simple. They even take precedence over crazy
instructions like INSERTPS because they have very high throughput on
modern x86 chips.

I still have to teach the integer shuffle variants about this to avoid
so many domain crossings. However, due to the particular instructions
available, that's a touch more complex and so a separate patch.

Also, the backend doesn't seem to realize it can commute blend
instructions by negating the mask. That would help remove a number of
copies here. Suggestions on how to do this are welcome; it's an area
I'm less familiar with.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217744 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |  35 ++++++++
 test/CodeGen/X86/vector-shuffle-128-v2.ll | 100 +++++++++++++++++-----
 test/CodeGen/X86/vector-shuffle-128-v4.ll |  14 ++-
 test/CodeGen/X86/vector-shuffle-256-v4.ll |  22 ++--
 4 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f3774321a07..8c9d8711d1f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7233,6 +7233,31 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
   return DAG.getConstant(Imm, MVT::i8);
 }
 
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for
+/// blending these values. It relies on the availability of the
+/// X86ISD::BLENDI pattern to be matched in the backend with the type given.
+/// What it does check for is that the shuffle mask is in fact a blend.
+static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Mask,
+                                         SelectionDAG &DAG) {
+
+  unsigned BlendMask = 0;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Mask[i] >= Size) {
+      if (Mask[i] != i + Size)
+        return SDValue(); // Shuffled V2 input!
+      BlendMask |= 1u << i;
+      continue;
+    }
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return SDValue(); // Shuffled V1 input!
+  }
+  return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+                     DAG.getConstant(BlendMask, MVT::i8));
+}
+
 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
 ///
 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
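
To make the helper above concrete: a shuffle mask is a blend exactly when each lane either keeps V1's element in place (Mask[i] == i, or undef) or takes V2's element for the same lane (Mask[i] == i + Size), and the immediate gets one bit per lane sourced from V2. A minimal standalone sketch of the same computation, outside SelectionDAG (the function name and the -1 "not a blend" return are illustrative only):

  #include <cstdio>
  #include <vector>

  // Mirrors the loop above: returns the blend immediate, or -1 when the
  // mask is not a lane-for-lane blend of the two inputs.
  static int blendImmForMask(const std::vector<int> &Mask) {
    int Size = (int)Mask.size();
    unsigned BlendMask = 0;
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] >= Size) {                // Lane sourced from V2...
        if (Mask[i] != i + Size) return -1; // ...but moved: not a blend.
        BlendMask |= 1u << i;
        continue;
      }
      if (Mask[i] >= 0 && Mask[i] != i)     // Lane from V1 but moved.
        return -1;                          // (Negative lanes are undef.)
    }
    return (int)BlendMask;
  }

  int main() {
    // <0,5,2,7>: lanes 1 and 3 come from V2, in place -> imm 0b1010 = 10.
    printf("%d\n", blendImmForMask({0, 5, 2, 7})); // 10
    // <0,2>: lane 1 takes V1's element 2, which moved -> not a blend.
    printf("%d\n", blendImmForMask({0, 2}));       // -1
  }
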
@@ -7267,6 +7292,11 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 1, 3))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+      return Blend;
+
   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
   return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
                      DAG.getConstant(SHUFPDMask, MVT::i8));
@@ -7353,6 +7383,11 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend =
+            lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+      return Blend;
+
   if (NumV2Elements == 1) {
     int V2Index = std::find_if(Mask.begin(), Mask.end(),
                                [](int M) { return M >= 4; }) -
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 619105f5026..f6382a98559 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -111,17 +111,35 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x double> %shuffle
 }
 define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x double> %shuffle
 }
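
For reference, bit i of the blendpd immediate selects lane i from the second source, so the SSE41 pattern for shuffle_v2f64_03 corresponds to immediate 0x2 (lane 0 from xmm0, lane 1 from xmm1). A small intrinsics sketch of that semantics (an illustrative standalone program, assumes an SSE4.1 target such as -msse4.1):

  #include <smmintrin.h> // SSE4.1: _mm_blend_pd
  #include <cstdio>

  int main() {
    __m128d a = _mm_set_pd(1.0, 0.0); // lanes: a[0] = 0.0, a[1] = 1.0
    __m128d b = _mm_set_pd(3.0, 2.0); // lanes: b[0] = 2.0, b[1] = 3.0

    // Immediate 0x2: bit 0 clear -> lane 0 from a; bit 1 set -> lane 1
    // from b. This is shuffle mask <0, 3>, i.e. shuffle_v2f64_03 above.
    __m128d r = _mm_blend_pd(a, b, 0x2);

    double out[2];
    _mm_storeu_pd(out, r);
    printf("%g %g\n", out[0], out[1]); // 0 3
  }
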
@@ -143,17 +161,35 @@ define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03_copy
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03_copy
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03_copy
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03_copy
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
   ret <2 x i64> %shuffle
 }
@@ -204,18 +240,38 @@ define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
 define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21_copy
-; ALL: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; ALL-NEXT: movapd %xmm2, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21_copy
+; SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21_copy
+; SSE3: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE3-NEXT: movapd %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21_copy
+; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
   ret <2 x i64> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 9105197f67c..d5bb55a2caa 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -216,11 +216,14 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v4f32_4zzz
-; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = xmm0[0],[[X]][1,2,3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: @shuffle_v4f32_4zzz
-; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = xmm0[0],[[X]][1,2,3]
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   ret <4 x float> %shuffle
@@ -290,11 +293,14 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: @shuffle_v4f32_zzz7
-; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = [[X]][0,1,2],xmm0[3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: @shuffle_v4f32_zzz7
-; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = [[X]][0,1,2],xmm0[3]
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
   ret <4 x float> %shuffle
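
The updated SSE41/AVX1 checks above show the new strategy: materialize zero with (v)xorps and blend the single live lane in, instead of using insertps. The same dataflow with intrinsics (an illustrative sketch, not part of the test suite; SSE4.1 required):

  #include <smmintrin.h> // SSE4.1: _mm_blend_ps
  #include <cstdio>

  int main() {
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // lanes 0..3 = 1,2,3,4

    // shuffle_v4f32_4zzz: keep a's lane 0, zero the rest. The xorps
    // produces the zero register; blend immediate 0x1 takes lane 0 from a.
    __m128 zero = _mm_setzero_ps();
    __m128 r = _mm_blend_ps(zero, a, 0x1); // -> (1, 0, 0, 0)

    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  }
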
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index a21b78985d7..cd79a38ca4a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -40,7 +40,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0300
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -119,7 +119,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: @shuffle_v4f64_0300
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT: retq
@@ -282,7 +282,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
@@ -293,7 +293,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@@ -305,7 +305,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
@@ -317,7 +317,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
@@ -335,9 +335,9 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0451
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
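
On the copies mentioned in the commit message: a blend is commutable by swapping its sources and inverting the immediate over the lane bits, i.e. blend(V1, V2, m) == blend(V2, V1, ~m & ((1 << lanes) - 1)). Exploiting that would let several of the vblendpd results above land directly in the destination register instead of needing a movapd/movaps copy. A quick intrinsics check of the identity (hypothetical example, SSE4.1):

  #include <smmintrin.h> // SSE4.1: _mm_blend_pd
  #include <cstdio>
  #include <cstring>

  int main() {
    __m128d a = _mm_set_pd(1.0, 0.0);
    __m128d b = _mm_set_pd(3.0, 2.0);

    // blend(a, b, 0x1): lane 0 from b, lane 1 from a.
    __m128d r1 = _mm_blend_pd(a, b, 0x1);
    // Commuted form: swap sources, invert both mask bits (~0x1 & 0x3 = 0x2).
    __m128d r2 = _mm_blend_pd(b, a, 0x2);

    printf("%s\n", memcmp(&r1, &r2, sizeof(r1)) == 0 ? "equal" : "different");
  }
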
@@ -355,9 +355,9 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_4015
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
 ; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT: retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
@@ -370,7 +370,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
 ; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
-- 
2.40.0
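
A closing note on the AVX1 sequences in these tests: with no general cross-lane 256-bit shuffle available, the lowering works on 128-bit halves, i.e. extract the high half, blend within each half, and reassemble with vinsertf128. The same shape expressed with intrinsics, mirroring shuffle_v4f64_0300 (an illustrative sketch assuming AVX; not one of the tests):

  #include <immintrin.h> // AVX
  #include <cstdio>

  int main() {
    __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); // lanes 0..3 = 0,1,2,3

    __m128d lo = _mm256_castpd256_pd128(a);    // lanes 0,1
    __m128d hi = _mm256_extractf128_pd(a, 1);  // lanes 2,3 (vextractf128)
    __m128d newLo = _mm_blend_pd(lo, hi, 0x2); // lanes 0,3 (vblendpd)
    __m128d newHi = _mm_unpacklo_pd(lo, lo);   // lanes 0,0 (vunpcklpd)
    __m256d r =                                // reassemble (vinsertf128)
        _mm256_insertf128_pd(_mm256_castpd128_pd256(newLo), newHi, 1);

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 3 0 0
  }
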