return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V);
}
+// Check for whether we can use INSERTPS to perform the shuffle. We only use
+// INSERTPS when the V1 elements are already in the correct locations
+// because otherwise we can just always use two SHUFPS instructions which
+// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
+// perform INSERTPS if a single V1 element is out of place and all V2
+// elements are zeroable.
+static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!");
+ assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
+ assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
+
+ SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+
+ unsigned ZMask = 0;
+ int V1DstIndex = -1;
+ int V2DstIndex = -1;
+ bool V1UsedInPlace = false;
+
+ for (int i = 0; i < 4; i++) {
+ // Synthesize a zero mask from the zeroable elements (includes undefs).
+ if (Zeroable[i]) {
+ ZMask |= 1 << i;
+ continue;
+ }
+
+ // Flag if we use any V1 inputs in place.
+ if (i == Mask[i]) {
+ V1UsedInPlace = true;
+ continue;
+ }
+
+ // We can only insert a single non-zeroable element.
+ if (V1DstIndex != -1 || V2DstIndex != -1)
+ return SDValue();
+
+ if (Mask[i] < 4) {
+ // V1 input out of place for insertion.
+ V1DstIndex = i;
+ } else {
+ // V2 input for insertion.
+ V2DstIndex = i;
+ }
+ }
+
+ // Don't bother if we have no (non-zeroable) element for insertion.
+ if (V1DstIndex == -1 && V2DstIndex == -1)
+ return SDValue();
+
+ // Determine element insertion src/dst indices. The src index is from the
+ // start of the inserted vector, not the start of the concatenated vector.
+ unsigned V2SrcIndex = 0;
+ if (V1DstIndex != -1) {
+ // If we have a V1 input out of place, we use V1 as the V2 element insertion
+ // and don't use the original V2 at all.
+ V2SrcIndex = Mask[V1DstIndex];
+ V2DstIndex = V1DstIndex;
+ V2 = V1;
+ } else {
+ V2SrcIndex = Mask[V2DstIndex] - 4;
+ }
+
+ // If no V1 inputs are used in place, then the result is created only from
+ // the zero mask and the V2 insertion - so remove V1 dependency.
+ if (!V1UsedInPlace)
+ V1 = DAG.getUNDEF(MVT::v4f32);
+
+ unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask;
+ assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
+
+ // Insert the V2 element into the desired position.
+ SDLoc DL(Op);
+ return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
+ DAG.getConstant(InsertPSMask, MVT::i8));
+}
+
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
Mask, Subtarget, DAG))
return V;
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41()) {
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
Subtarget, DAG))
return Blend;
- // Check for whether we can use INSERTPS to perform the blend. We only use
- // INSERTPS when the V1 elements are already in the correct locations
- // because otherwise we can just always use two SHUFPS instructions which
- // are much smaller to encode than a SHUFPS and an INSERTPS.
- if (NumV2Elements == 1 && Subtarget->hasSSE41()) {
- int V2Index =
- std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) -
- Mask.begin();
-
- // When using INSERTPS we can zero any lane of the destination. Collect
- // the zero inputs into a mask and drop them from the lanes of V1 which
- // actually need to be present as inputs to the INSERTPS.
- SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-
- // Synthesize a shuffle mask for the non-zero and non-v2 inputs.
- bool InsertNeedsShuffle = false;
- unsigned ZMask = 0;
- for (int i = 0; i < 4; ++i)
- if (i != V2Index) {
- if (Zeroable[i]) {
- ZMask |= 1 << i;
- } else if (Mask[i] != i) {
- InsertNeedsShuffle = true;
- break;
- }
- }
-
- // We don't want to use INSERTPS or other insertion techniques if it will
- // require shuffling anyways.
- if (!InsertNeedsShuffle) {
- // If all of V1 is zeroable, replace it with undef.
- if ((ZMask | 1 << V2Index) == 0xF)
- V1 = DAG.getUNDEF(MVT::v4f32);
-
- unsigned InsertPSMask = (Mask[V2Index] - 4) << 6 | V2Index << 4 | ZMask;
- assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
-
- // Insert the V2 element into the desired position.
- return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
- DAG.getConstant(InsertPSMask, MVT::i8));
- }
+ // Use INSERTPS if we can complete the shuffle efficiently.
+ if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG))
+ return V;
}
// Otherwise fall back to a SHUFPS lowering strategy.
; CHECK-LABEL: test19:
; CHECK: # BB#0:
; CHECK-NEXT: xorps %xmm2, %xmm2
-; CHECK-NEXT: xorps %xmm3, %xmm3
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[0,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
-; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,2]
-; CHECK-NEXT: orps %xmm3, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
+; CHECK-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,2]
+; CHECK-NEXT: orps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
; AVX2-LABEL: test5
; AVX2: vmaskmovpd
; AVX2: vblendvpd
-; AVX2: vmaskmovpd
+; AVX2: vmaskmovpd
; AVX2: vblendvpd
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
}
; AVX2-LABEL: test14
-; AVX2: vshufps $-24
+; AVX2: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2: vmaskmovps
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
}
-declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
+declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
-declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
}
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test3c:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_bitwise_ops_test3c:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3c:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3c:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX: # BB#0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>