return true;
}
-/// \brief Check whether all of one set of inputs to a shuffle mask are in
-/// place.
-///
-/// A mask entry is "in place" when it selects, from the chosen input, the
-/// element that already sits in the same lane: entry i equals i for the low
-/// input, or i + Mask.size() for the high input. Mask entries pointing at the
-/// other input or undef (-1) will be skipped.
-///
-/// \param Mask    Two-input shuffle mask; entries below Mask.size() select
-///                from the low input and the remaining ones from the high
-///                input.
-/// \param LoInput Check the low input when true, the high input otherwise.
-/// \returns true if no inspected entry moves an element to a different lane.
-static bool isShuffleMaskInputInPlace(ArrayRef<int> Mask, bool LoInput = true) {
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    int M = Mask[i];
-    // Split the two inputs at Size rather than at a literal 4 so that this
-    // filter agrees with the Size-based offset used in the comparison below
-    // for masks of any width.
-    if (M == -1 || (LoInput && M >= Size) || (!LoInput && M < Size))
-      continue;
-    if (M - (LoInput ? 0 : Size) != i)
-      return false;
-  }
-  return true;
-}
-
// Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
// 2013 will allow us to use it as a non-type template parameter.
namespace {
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS.
- if (Subtarget->hasSSE41() &&
- isShuffleMaskInputInPlace(Mask, /*LoInput*/ true)) {
- // Insert the V2 element into the desired position.
- SDValue InsertPSMask =
- DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4);
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- InsertPSMask);
+ if (Subtarget->hasSSE41()) {
+ // When using INSERTPS we can zero any lane of the destination. Collect
+ // the zero inputs into a mask and drop them from the lanes of V1 which
+ // actually need to be present as inputs to the INSERTPS.
+ unsigned ZMask = 0;
+ if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+ ZMask = 0xF ^ (1 << V2Index);
+ } else if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+ for (int i = 0; i < 4; ++i) {
+ int M = Mask[i];
+ if (M >= 4)
+ continue;
+ if (M > -1) {
+ SDValue Input = V1.getOperand(M);
+ if (Input.getOpcode() != ISD::UNDEF &&
+ !X86::isZeroNode(Input)) {
+ // A non-zero input!
+ ZMask = 0;
+ break;
+ }
+ }
+ ZMask |= 1 << i;
+ }
+ }
+
+ // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
+ int InsertShuffleMask[4] = {-1, -1, -1, -1};
+ for (int i = 0; i < 4; ++i)
+ if (i != V2Index && (ZMask & (1 << i)) == 0)
+ InsertShuffleMask[i] = Mask[i];
+
+ if (isNoopShuffleMask(InsertShuffleMask)) {
+ // Replace V1 with undef if nothing from V1 survives the INSERTPS.
+ if ((ZMask | 1 << V2Index) == 0xF)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ // Insert the V2 element into the desired position.
+ SDValue InsertPSMask =
+ DAG.getIntPtrConstant(Mask[V2Index] << 6 | V2Index << 4 | ZMask);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ InsertPSMask);
+ }
}
// Compute the index adjacent to V2Index and in the same half by toggling
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i32> %shuffle
}
+
+; Lane 0 takes element 0 of %a; lanes 1-3 keep the matching lanes of the
+; all-zero first operand. The checks under the SSE41 and AVX1 prefixes expect a
+; single insertps/vinsertps that zeroes lanes 1-3, while the SSE2 prefix
+; expects an xorps followed by two shufps.
+define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_4zzz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][1,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,2],[[X]][2,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_4zzz
+; SSE41: insertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_4zzz
+; AVX1: vinsertps {{.*}} # xmm0 = xmm0[0],zero,zero,zero
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
+; Lane 1 takes element 0 of %a. The other lanes read the all-zero first
+; operand out of order (elements 2, 3 and 0), which still yields zero in every
+; such lane. The checks under the SSE41 and AVX1 prefixes expect a single
+; insertps/vinsertps; the SSE2 prefix expects an xorps followed by two shufps.
+define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z4zz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][2,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][3,0]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z4zz
+; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z4zz
+; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[0],zero,zero
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
+ ret <4 x float> %shuffle
+}
+
+; Lane 2 takes element 0 of %a; every other lane reads element 0 of the
+; all-zero first operand. The checks under the SSE41 and AVX1 prefixes expect a
+; single insertps/vinsertps; the SSE2 prefix expects an xorps, two shufps and
+; a movaps that copies the result back into xmm0.
+define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zz4z
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[0,0],[[X]][0,0]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,0],xmm0[0,2]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zz4z
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zz4z
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,xmm0[0],zero
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
+ ret <4 x float> %shuffle
+}
+
+; Lane 3 takes element 0 of %a, lane 0 reads the all-zero first operand, and
+; lanes 1-2 are undef in the mask. The checks under the SSE41 and AVX1 prefixes
+; expect a single insertps/vinsertps in which the undef lanes are emitted as
+; zero; the SSE2 prefix expects an xorps, one shufps and a movaps.
+define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zuu4
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zuu4
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zuu4
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[0]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
+ ret <4 x float> %shuffle
+}
+
+; Lane 3 takes element 3 of %a (mask index 7); lanes 0-2 keep the matching
+; lanes of the all-zero first operand. The checks under the SSE41 and AVX1
+; prefixes expect a single insertps/vinsertps sourcing xmm0[3]; the SSE2 prefix
+; expects an xorps, two shufps and a movaps.
+define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_zzz7
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[3,0],[[X]][2,0]
+; SSE2-NEXT: shufps {{.*}} # [[X]] = [[X]][0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %[[X]], %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_zzz7
+; SSE41: insertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_zzz7
+; AVX1: vinsertps {{.*}} # xmm0 = zero,zero,zero,xmm0[3]
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x float> %shuffle
+}
+
+; Lane 1 takes element 2 of %a (mask index 6); lanes 0, 2 and 3 keep the
+; matching lanes of the all-zero first operand. The checks under the SSE41 and
+; AVX1 prefixes expect a single insertps/vinsertps sourcing xmm0[2]; the SSE2
+; prefix expects an xorps followed by two shufps.
+define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
+; SSE2-LABEL: @shuffle_v4f32_z6zz
+; SSE2: xorps %[[X:xmm[0-9]+]], %[[X]]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][0,0]
+; SSE2-NEXT: shufps {{.*}} # xmm0 = xmm0[2,0],[[X]][2,3]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: @shuffle_v4f32_z6zz
+; SSE41: insertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: @shuffle_v4f32_z6zz
+; AVX1: vinsertps {{.*}} # xmm0 = zero,xmm0[2],zero,zero
+; AVX1-NEXT: retq
+ %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}