return DAG.getConstant(Imm, MVT::i8);
}
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is in fact a blend.
+///
+/// \returns the BLENDI node on success, or a null SDValue when the mask
+/// requires moving an element out of its lane (i.e. it is not a blend).
+static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+
+ // Bit i of the immediate selects lane i from V2; a clear bit selects V1.
+ unsigned BlendMask = 0;
+ for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+ if (Mask[i] >= Size) {
+ // Lane i reads V2: a blend can only take the *same* lane of V2.
+ if (Mask[i] != i + Size)
+ return SDValue(); // Shuffled V2 input!
+ BlendMask |= 1u << i;
+ continue;
+ }
+ // Lane i reads V1 or is undef (Mask[i] < 0). Undef lanes are simply left
+ // selecting V1 (bit stays clear); a defined V1 lane must be in place.
+ if (Mask[i] >= 0 && Mask[i] != i)
+ return SDValue(); // Shuffled V1 input!
+ }
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
+ DAG.getConstant(BlendMask, MVT::i8));
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
if (isShuffleEquivalent(Mask, 1, 3))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2);
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, DAG))
+ return Blend;
+
unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2,
DAG.getConstant(SHUFPDMask, MVT::i8));
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2);
+ if (Subtarget->hasSSE41())
+ if (SDValue Blend =
+ lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return Blend;
+
if (NumV2Elements == 1) {
int V2Index =
std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; ALL-LABEL: @shuffle_v2f64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2f64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2f64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2f64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
}
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03
-; ALL: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03
+; SSE2: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03
+; SSE3: shufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03
+; SSE41: blendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_03_copy
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_03_copy
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_03_copy
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_03_copy
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21
-; ALL: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
-; ALL-NEXT: movapd %xmm1, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21
+; SSE2: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21
+; SSE3: shufpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21
+; SSE41: blendpd {{.*}} # xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
-; ALL-LABEL: @shuffle_v2i64_21_copy
-; ALL: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; ALL-NEXT: movapd %xmm2, %xmm0
-; ALL-NEXT: retq
+; SSE2-LABEL: @shuffle_v2i64_21_copy
+; SSE2: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: @shuffle_v2i64_21_copy
+; SSE3: shufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE3-NEXT: movapd %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v2i64_21_copy
+; SSE41: blendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; SSE41-NEXT: movapd %xmm2, %xmm0
+; SSE41-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_4zzz
-; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = xmm0[0],[[X]][1,2,3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_4zzz
-; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = xmm0[0],[[X]][1,2,3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle
; SSE2-NEXT: retq
;
; SSE41-LABEL: @shuffle_v4f32_zzz7
-; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE41-NEXT: blendps {{.*}} # [[X]] = [[X]][0,1,2],xmm0[3]
+; SSE41-NEXT: movaps %[[X]], %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: @shuffle_v4f32_zzz7
-; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1: vxorps %[[X:xmm[0-9]+]], %[[X]]
+; AVX1-NEXT: vblendps {{.*}} # xmm0 = [[X]][0,1,2],xmm0[3]
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x float> %shuffle
; AVX1-LABEL: @shuffle_v4i64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX1-LABEL: @shuffle_v4f64_0300
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
; AVX1-LABEL: @shuffle_v4i64_0451
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
; AVX1-LABEL: @shuffle_v4i64_4015
; AVX1: # BB#0:
; AVX1-NEXT: vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
; AVX1-NEXT: vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
; AVX1-NEXT: vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vshufpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq