return true;
}
+/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
+///
+/// The masks must be exactly the same width.
+///
+/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
+/// value in ExpectedMask is always accepted. Otherwise the indices must match.
+///
+/// SM_SentinelZero is accepted as a valid negative index but must match in
+/// both masks.
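+///
+/// For example, Mask = {0, SM_SentinelUndef} is equivalent to
+/// ExpectedMask = {0, SM_SentinelZero} (the undef lane accepts anything),
+/// but Mask = {0, 1} is not, as index 1 differs from the expected
+/// SM_SentinelZero.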
+static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
+ ArrayRef<int> ExpectedMask) {
+ int Size = Mask.size();
+ if (Size != (int)ExpectedMask.size())
+ return false;
+
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
+ return false;
+ else if (Mask[i] != ExpectedMask[i])
+ return false;
+
+ return true;
+}
+
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
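+// For example, a v4f32 mask of {1, 1, 3, 3} (possibly with some lanes undef)
+// is matched below to X86ISD::MOVSHDUP.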
static bool matchUnaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
unsigned &Shuffle, MVT &ShuffleVT) {
bool FloatDomain = SrcVT.isFloatingPoint();
// Match a 128-bit integer vector against a VZEXT_MOVL (MOVQ) instruction.
- if (!FloatDomain && SrcVT.is128BitVector() && Mask.size() == 2 &&
- Mask[0] == 0 && Mask[1] < 0) {
+ if (!FloatDomain && SrcVT.is128BitVector() &&
+ isTargetShuffleEquivalent(Mask, {0, SM_SentinelZero})) {
Shuffle = X86ISD::VZEXT_MOVL;
ShuffleVT = MVT::v2i64;
return true;
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand even from an unaligned memory load.
if (SrcVT.is128BitVector() && Subtarget.hasSSE3()) {
- if (Mask.equals({0, 0})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v2f64;
return true;
}
- if (Mask.equals({0, 0, 2, 2})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v4f32;
return true;
}
- if (Mask.equals({1, 1, 3, 3})) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v4f32;
return true;
if (SrcVT.is256BitVector()) {
assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
- if (Mask.equals({0, 0, 2, 2})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v4f64;
return true;
}
- if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v8f32;
return true;
}
- if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7})) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v8f32;
return true;
if (SrcVT.is512BitVector()) {
assert(Subtarget.hasAVX512() &&
"AVX512 required for 512-bit vector shuffles");
- if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
Shuffle = X86ISD::MOVDDUP;
ShuffleVT = MVT::v8f64;
return true;
}
- if (Mask.equals({0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
+ if (isTargetShuffleEquivalent(
+ Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
Shuffle = X86ISD::MOVSLDUP;
ShuffleVT = MVT::v16f32;
return true;
}
- if (Mask.equals({1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
+ if (isTargetShuffleEquivalent(
+ Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
Shuffle = X86ISD::MOVSHDUP;
ShuffleVT = MVT::v16f32;
return true;
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
-// TODO: Investigate using isShuffleEquivalent() instead of Mask.equals().
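+// For example, a 2-element float-domain mask of {0, 0} is matched below to
+// X86ISD::MOVLHPS, with the single input used as both operands.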
static bool matchBinaryVectorShuffle(MVT SrcVT, ArrayRef<int> Mask,
unsigned &Shuffle, MVT &ShuffleVT) {
bool FloatDomain = SrcVT.isFloatingPoint();
if (SrcVT.is128BitVector()) {
- if (Mask.equals({0, 0}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
Shuffle = X86ISD::MOVLHPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (Mask.equals({1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
Shuffle = X86ISD::MOVHLPS;
ShuffleVT = MVT::v4f32;
return true;
}
- if (Mask.equals({0, 0, 1, 1}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1}) && FloatDomain) {
Shuffle = X86ISD::UNPCKL;
ShuffleVT = MVT::v4f32;
return true;
}
- if (Mask.equals({2, 2, 3, 3}) && FloatDomain) {
+ if (isTargetShuffleEquivalent(Mask, {2, 2, 3, 3}) && FloatDomain) {
Shuffle = X86ISD::UNPCKH;
ShuffleVT = MVT::v4f32;
return true;
}
- if (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
- Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
+ if (isTargetShuffleEquivalent(Mask, {0, 0, 1, 1, 2, 2, 3, 3}) ||
+ isTargetShuffleEquivalent(
+ Mask, {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7})) {
Shuffle = X86ISD::UNPCKL;
ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
return true;
}
- if (Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) ||
- Mask.equals(
- {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15})) {
+ if (isTargetShuffleEquivalent(Mask, {4, 4, 5, 5, 6, 6, 7, 7}) ||
+ isTargetShuffleEquivalent(Mask, {8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13,
+ 13, 14, 14, 15, 15})) {
Shuffle = X86ISD::UNPCKH;
ShuffleVT = Mask.size() == 8 ? MVT::v8i16 : MVT::v16i8;
return true;
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 1
; ALL: # BB#0:
; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: retq
- %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 1, i32 1, i32 3, i32 3>)
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
ret <4 x float> %1
}
; ALL: # BB#0:
; ALL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; ALL-NEXT: retq
- %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>)
+ %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
ret <4 x float> %1
}
; ALL-LABEL: combine_vpermilvar_8f32_identity:
; ALL: # BB#0:
; ALL-NEXT: retq
- %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
ret <8 x float> %2
}
; ALL: # BB#0:
; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: retq
- %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>)
+ %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
ret <8 x float> %1
}
; CHECK: # BB#0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: retq
- %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 -1)
+ %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
ret <8 x double> %res0
}
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; CHECK-LABEL: combine_vpermt2var_8i64_identity:
; CHECK: # BB#0:
; CHECK-NEXT: retq
- %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
ret <8 x i64> %res1
}
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
- %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
- %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
+ %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
ret <16 x float> %res0
}
; CHECK-LABEL: combine_vpermt2var_16i32_identity:
; CHECK: # BB#0:
; CHECK-NEXT: retq
- %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
ret <16 x i32> %res1
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; CHECK: # BB#0:
; CHECK-NEXT: retq
%select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
- %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
+ %mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
%res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res0, <64 x i8> %mask, <64 x i8> %select, i64 -1)
ret <64 x i8> %res1
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,u,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
ret <16 x i8> %1
define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE: # BB#0:
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,10,11,10,11,12,13,12,13,14,15,u,u]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
ret <16 x i8> %1
; CHECK: # BB#0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-NEXT: retq
- %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
+ %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
ret <16 x i8> %res0
}