return SDValue();
}
+/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
+/// followed by concatenation. Narrow vector ops may have better performance
+/// than wide ops, and this can unlock further narrowing of other vector ops.
+/// Targets can invert this transform later if it is not profitable.
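+/// For example (illustrative, a v8i16 shuffle of v4i16 halves X and Y):
+///   shuffle (concat X, undef), (concat Y, undef), <0,2,8,10,1,3,9,11> -->
+///   concat (shuffle X, Y, <0,2,4,6>), (shuffle X, Y, <1,3,5,7>)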
+static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
+ SelectionDAG &DAG) {
+ SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
+ if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
+ N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
+ !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
+ return SDValue();
+
+ // Split the wide shuffle mask into halves. Any mask element that accesses
+ // operand 1 is offset down to account for the narrowing of the vectors.
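+ // For example (illustrative), with NumElts == 8, wide mask element 9 selects
+ // element 1 of operand 1; it becomes narrow mask element 5 (9 - HalfNumElts),
+ // which selects the same element of Y in the narrow shuffles built below.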
+ ArrayRef<int> Mask = Shuf->getMask();
+ EVT VT = Shuf->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfNumElts = NumElts / 2;
+ SmallVector<int, 16> Mask0(HalfNumElts, -1);
+ SmallVector<int, 16> Mask1(HalfNumElts, -1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (Mask[i] == -1)
+ continue;
+ // If the wide mask element selects the undef upper half of either concat
+ // operand, leave the narrow mask element undef (-1) rather than creating an
+ // out-of-range index for the narrow shuffles.
+ if ((Mask[i] % NumElts) >= HalfNumElts)
+ continue;
+ int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
+ if (i < HalfNumElts)
+ Mask0[i] = M;
+ else
+ Mask1[i - HalfNumElts] = M;
+ }
+
+ // Ask the target if this is a valid transform.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
+ HalfNumElts);
+ if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
+ !TLI.isShuffleMaskLegal(Mask1, HalfVT))
+ return SDValue();
+
+ // shuffle (concat X, undef), (concat Y, undef), Mask -->
+ // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
+ SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
+ SDLoc DL(Shuf);
+ SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
+ SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
+}
+
// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into simpler shuffle then concat.
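+// For example (illustrative), with v2i32 subvectors A, B, C, D:
+//   shuffle (concat A, B), (concat C, D), <2,3,4,5> --> concat B, C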
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
}
+ if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
+ return V;
+
return SDValue();
}
; CHECK-LABEL: vuzp_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vorr q9, q8, q8
-; CHECK-NEXT: vuzp.16 q8, q9
-; CHECK-NEXT: vmov r0, r1, d18
-; CHECK-NEXT: vmov r2, r3, d19
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vuzp.16 d18, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <4 x i16>, <4 x i16>* %A
define <4 x i32> @vuzp_lower_shufflemask_zeroed(<2 x i32>* %A, <2 x i32>* %B) {
; CHECK-LABEL: vuzp_lower_shufflemask_zeroed:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vorr d19, d18, d18
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vdup.32 q9, d16[0]
-; CHECK-NEXT: vuzp.32 q8, q9
-; CHECK-NEXT: vext.32 q8, q9, q9, #2
-; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vtrn.32 d19, d17
+; CHECK-NEXT: vdup.32 d16, d18[0]
; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <2 x i32>, <2 x i32>* %A
define void @vuzp_rev_shufflemask_vtrn(<2 x i32>* %A, <2 x i32>* %B, <4 x i32>* %C) {
; CHECK-LABEL: vuzp_rev_shufflemask_vtrn:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vrev64.32 q9, q8
-; CHECK-NEXT: vuzp.32 q8, q9
-; CHECK-NEXT: vst1.64 {d18, d19}, [r2]
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vldr d17, [r0]
+; CHECK-NEXT: vtrn.32 d17, d16
+; CHECK-NEXT: vst1.64 {d16, d17}, [r2]
; CHECK-NEXT: mov pc, lr
entry:
%tmp1 = load <2 x i32>, <2 x i32>* %A
; CHECK-LABEL: vzip_lower_shufflemask_undef:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldr d17, [r1]
-; CHECK-NEXT: vldr d16, [r0]
-; CHECK-NEXT: vzip.16 d16, d17
+; CHECK-NEXT: vldr d18, [r0]
+; CHECK-NEXT: vzip.16 d18, d17
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov pc, lr
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm0[1]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
%even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
; SSE42-NEXT: movdqa %xmm2, (%rdi)
; SSE42-NEXT: retq
;
-; AVX1-LABEL: v7i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
-; AVX1-NEXT: vmovss %xmm1, 24(%rdi)
-; AVX1-NEXT: vmovlps %xmm0, 16(%rdi)
-; AVX1-NEXT: vmovaps %xmm2, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v7i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
-; AVX2-NEXT: vmovaps %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: v7i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vmovss %xmm1, 24(%rdi)
+; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm2, (%rdi)
+; AVX-NEXT: retq
;
; XOP-LABEL: v7i32:
; XOP: # %bb.0:
; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
-; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
+; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; XOP-NEXT: vmovss %xmm1, 24(%rdi)
; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
; XOP-NEXT: vmovaps %xmm2, (%rdi)
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,3,4,5,6,7]
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movq %xmm2, 16(%rdi)
; SSE2-NEXT: movdqa %xmm3, (%rdi)
; SSE2-NEXT: retq
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
-; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3,4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,3]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]