bool IsUnary;
if (!isTargetShuffle(N.getOpcode()))
return false;
- if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops,
- Mask, IsUnary))
+
+ MVT VT = N.getSimpleValueType();
+ if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
return false;
SDValue V1 = Ops[0];
}
}
+ assert(VT.getVectorNumElements() == Mask.size() &&
+ "Different mask size from vector size!");
return true;
}
+// Attempt to decode ops that could be represented as a shuffle mask.
+// The decoded shuffle mask may contain a different number of elements to the
+// destination value type.
+static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<SDValue> &Ops) {
+ Mask.clear();
+ Ops.clear();
+
+ MVT VT = N.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI: {
+ uint64_t ShiftVal = N.getConstantOperandVal(1);
+ // Out of range bit shifts are guaranteed to be zero.
+ if (VT.getScalarSizeInBits() <= ShiftVal) {
+ Mask.append(NumElts, SM_SentinelZero);
+ return true;
+ }
+
+ // We can only decode 'whole byte' bit shifts as shuffles.
+ if ((ShiftVal % 8) != 0)
+ break;
+
+ uint64_t ByteShift = ShiftVal / 8;
+ unsigned NumBytes = VT.getSizeInBits() / 8;
+ unsigned NumBytesPerElt = VT.getScalarSizeInBits() / 8;
+ Ops.push_back(N.getOperand(0));
+
+ // Clear mask to all zeros and insert the shifted byte indices.
+ Mask.append(NumBytes, SM_SentinelZero);
+
+ if (X86ISD::VSHLI == Opcode) {
+ for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j] = i + j - ByteShift;
+ } else {
+ for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+ Mask[i + j - ByteShift] = i + j;
+ }
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
SmallVectorImpl<int> &Mask) {
SmallVector<SDValue, 2> Ops;
if (!setTargetShuffleZeroElements(Op, Mask, Ops))
- return false;
+ if (!getFauxShuffleMask(Op, Mask, Ops))
+ return false;
int NumElts = Mask.size();
bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
Ops.push_back(Input1);
}
- assert(VT.getVectorNumElements() == OpMask.size() &&
- "Different mask size from vector size!");
assert(((RootMask.size() > OpMask.size() &&
RootMask.size() % OpMask.size() == 0) ||
(OpMask.size() > RootMask.size() &&
;
; SSSE3-LABEL: sext_16i8_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: psrld $16, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: movdqa %xmm0, %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,u,u,u,3,u,u,u],zero,xmm1[u,u,u],zero
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_4i64:
;
; SSSE3-LABEL: sext_16i8_to_8i64:
; SSSE3: # BB#0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,255,u,u,u,255>
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm2
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: psrad $31, %xmm0
+; SSSE3-NEXT: psrad $24, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
-; SSSE3-NEXT: psrld $16, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm3
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSSE3-NEXT: psrad $31, %xmm4
; SSSE3-NEXT: psrad $24, %xmm2
; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: psrld $16, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: psrad $24, %xmm3
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i64:
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
; X32: # BB#0:
-; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlw_pshufb:
; X64: # BB#0:
-; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1],zero,ymm0[3],zero,ymm0[5],zero,ymm0[7],zero,ymm0[9],zero,ymm0[11],zero,ymm0[13],zero,ymm0[15],zero,ymm0[17],zero,ymm0[19],zero,ymm0[21],zero,ymm0[23],zero,ymm0[25],zero,ymm0[27],zero,ymm0[29],zero,ymm0[31]
; X64-NEXT: retq
%1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = bitcast <16 x i16> %1 to <32 x i8>
define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
; X32: # BB#0:
-; X32-NEXT: vpslld $24, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pslld_pshufb:
; X64: # BB#0:
-; X64-NEXT: vpslld $24, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0],zero,zero,zero,ymm0[4],zero,zero,zero,ymm0[8],zero,zero,zero,ymm0[12],zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[20],zero,zero,zero,ymm0[24],zero,zero,zero,ymm0[28],zero,zero,zero
; X64-NEXT: retq
%1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
%2 = bitcast <8 x i32> %1 to <32 x i8>
define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
; X32: # BB#0:
-; X32-NEXT: vpsrlq $32, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,31,30,29,28,27,26,25,24,23]
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlq_pshufb:
; X64: # BB#0:
-; X64-NEXT: vpsrlq $32, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,31,30,29,28,27,26,25,24,23]
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT: retq
%1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
%2 = bitcast <4 x i64> %1 to <32 x i8>
define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE: # BB#0:
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%2 = bitcast <8 x i16> %1 to <16 x i8>
define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE: # BB#0:
-; SSE-NEXT: pslld $8, %xmm0
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX: # BB#0:
-; AVX-NEXT: vpslld $8, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT: retq
%1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
%2 = bitcast <4 x i32> %1 to <16 x i8>
define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE: # BB#0:
-; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX: # BB#0:
-; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT: retq
%1 = lshr <2 x i64> %a0, <i64 48, i64 48>
%2 = bitcast <2 x i64> %1 to <16 x i8>