[AVX-512] Add support for lowering shuffles to VALIGND/VALIGNQ

author Craig Topper <craig.topper@gmail.com>

Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)

committer Craig Topper <craig.topper@gmail.com>

Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)
author Craig Topper <craig.topper@gmail.com>
Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)
committer Craig Topper <craig.topper@gmail.com>
Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp

index 5f4a29d9812aca7d5434d38df3a8b170e229e910..cd1ce17811212ea30ddcac0a6c3be9ba44f3a173 100644 (file)
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7788,34 +7788,12 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
    return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  }
  
-/// \brief Try to lower a vector shuffle as a byte rotation.
-///
-/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
-/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
-/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
-/// try to generically lower a vector shuffle through such an pattern. It
-/// does not check for the profitability of lowering either as PALIGNR or
-/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
-/// This matches shuffle vectors that look like:
-///
-///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+/// \brief Try to lower a vector shuffle as a rotation.
  ///
-/// Essentially it concatenates V1 and V2, shifts right by some number of
-/// elements, and takes the low elements as the result. Note that while this is
-/// specified as a *right shift* because x86 is little-endian, it is a *left
-/// rotate* of the vector lanes.
-static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
-                                          ArrayRef<int> Mask) {
-  // Don't accept any shuffles with zero elements.
-  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
-    return -1;
-
-  // PALIGNR works on 128-bit lanes.
-  SmallVector<int, 16> RepeatedMask;
-  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
-    return -1;
-
-  int NumElts = RepeatedMask.size();
+/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
+static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
+                                      ArrayRef<int> Mask) {
+  int NumElts = Mask.size();
  
    // We need to detect various ways of spelling a rotation:
    //   [11, 12, 13, 14, 15,  0,  1,  2]
@@ -7827,7 +7805,7 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
    int Rotation = 0;
    SDValue Lo, Hi;
    for (int i = 0; i < NumElts; ++i) {
-    int M = RepeatedMask[i];
+    int M = Mask[i];
      assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
             "Unexpected mask index.");
      if (M < 0)
@@ -7879,8 +7857,43 @@ static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
    V1 = Lo;
    V2 = Hi;
  
+  return Rotation;
+}
+
+/// \brief Try to lower a vector shuffle as a byte rotation.
+///
+/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
+/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
+/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
+/// try to generically lower a vector shuffle through such a pattern. It
+/// does not check for the profitability of lowering either as PALIGNR or
+/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
+/// This matches shuffle vectors that look like:
+///
+///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result. Note that while this is
+/// specified as a *right shift* because x86 is little-endian, it is a *left
+/// rotate* of the vector lanes.
+static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
+                                          ArrayRef<int> Mask) {
+  // Don't accept any shuffles with zero elements.
+  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+    return -1;
+
+  // PALIGNR works on 128-bit lanes.
+  SmallVector<int, 16> RepeatedMask;
+  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
+    return -1;
+
+  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
+  if (Rotation <= 0)
+    return -1;
+
    // PALIGNR rotates bytes, so we need to scale the
    // rotation based on how many bytes are in the vector lane.
+  int NumElts = RepeatedMask.size();
    int Scale = 16 / NumElts;
    return Rotation * Scale;
  }
@@ -7931,6 +7944,37 @@ static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                          DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
  }
  
+/// \brief Try to lower a vector shuffle as a dword/qword rotation.
+///
+/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
+/// rotation of the concatenation of two vectors. This routine will
+/// try to generically lower a vector shuffle through such a pattern.
+///
+/// Essentially it concatenates V1 and V2, shifts right by some number of
+/// elements, and takes the low elements as the result (a *right shift* in
+/// little-endian x86 terms, i.e. a *left rotate* of the vector lanes).
+/// Returns an X86ISD::VALIGN node, or an empty SDValue if nothing matches.
+static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
+                                          SDValue V1, SDValue V2,
+                                          ArrayRef<int> Mask,
+                                          const X86Subtarget &Subtarget,
+                                          SelectionDAG &DAG) {
+  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+         "Only 32-bit and 64-bit elements are supported!");
+
+  // 128/256-bit vectors are only supported with VLX.
+  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
+         && "VLX required for 128/256-bit vectors");
+
+  SDValue Lo = V1, Hi = V2; // Copies; the matcher updates them in place.
+  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
+  if (Rotation <= 0)
+    return SDValue();
+
+  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
+                     DAG.getConstant(Rotation, DL, MVT::i8));
+}
+
  /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
  ///
  /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
@@ -11505,6 +11549,13 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;
  
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
+  // Try to use PALIGNR.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                        Mask, Subtarget, DAG))
      return Rotate;
@@ -11666,6 +11717,12 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;
  
+  // If we have VLX support, we can use VALIGN.
+  if (Subtarget.hasVLX())
+    if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Rotate;
+
    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12094,6 +12151,12 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;
  
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
+  // Try to use PALIGNR.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
                                                        Mask, Subtarget, DAG))
      return Rotate;
@@ -12143,6 +12206,11 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;
  
+  // Try to use VALIGN.
+  if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
+                                                  Mask, Subtarget, DAG))
+    return Rotate;
+
    // Try to use byte rotation instructions.
    if (Subtarget.hasBWI())
      if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll

index 4e71c5936e9a7952b7a0df946d0b80830ded5396..1541afbc3b51d7d8dc600ca8e1e9d38c0cee3302 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1462,8 +1462,7 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
  ;
  ; AVX512VL-LABEL: shuffle_v4i64_1234:
  ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX512VL-NEXT:    valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
  ; AVX512VL-NEXT:    retq
    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
    ret <4 x i64> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll

index fd21e4359e73085c697d05af44aa2b3dfdcf3a37..a7b0d36ca931d80c08bbf42ae95048da508d5238 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2562,8 +2562,7 @@ define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
  ;
  ; AVX512VL-LABEL: shuffle_v8i32_12345678:
  ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} ymm2 = [1,2,3,4,5,6,7,8]
-; AVX512VL-NEXT:    vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7],ymm1[0]
  ; AVX512VL-NEXT:    retq
    %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
    ret <8 x i32> %shuffle
@@ -2585,8 +2584,7 @@ define <8 x i32> @shuffle_v8i32_12345670(<8 x i32> %a) {
  ;
  ; AVX512VL-LABEL: shuffle_v8i32_12345670:
  ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vmovdqa32 {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
-; AVX512VL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
  ; AVX512VL-NEXT:    retq
    %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
    ret <8 x i32> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll

index c84409fe4d4c7ba6048253dcf84eb4104c74c50c..de6075420aae94d057b2ef3253b421bf75676104 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -342,8 +342,7 @@ define <16 x i32> @shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_z
  define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
  ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
  ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
    ret <16 x i32> %shuffle
@@ -352,8 +351,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_1
  define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
  ; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
  ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
-; ALL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
  ; ALL-NEXT:    retq
    %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
    ret <16 x i32> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll

index 475ed2d2e4f892a1ac7263100c5541697da05d3c..0f163ba2188df08d95cb45ff4e7793b4aeb060bd 100644 (file)
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2280,14 +2280,12 @@ define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {
  ;
  ; AVX512F-LABEL: shuffle_v8i64_12345678:
  ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8]
-; AVX512F-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
  ; AVX512F-NEXT:    retq
  ;
  ; AVX512F-32-LABEL: shuffle_v8i64_12345678:
  ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,5,0,6,0,7,0,8,0]
-; AVX512F-32-NEXT:    vpermt2q %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
  ; AVX512F-32-NEXT:    retl
    %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
    ret <8 x i64> %shuffle
@@ -2297,14 +2295,12 @@ define <8 x i64> @shuffle_v8i64_12345670(<8 x i64> %a) {
  ;
  ; AVX512F-LABEL: shuffle_v8i64_12345670:
  ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,2,3,4,5,6,7,0]
-; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
  ; AVX512F-NEXT:    retq
  ;
  ; AVX512F-32-LABEL: shuffle_v8i64_12345670:
  ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0]
-; AVX512F-32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
  ; AVX512F-32-NEXT:    retl
    %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
    ret <8 x i64> %shuffle
author	Craig Topper <craig.topper@gmail.com>
	Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)
committer	Craig Topper <craig.topper@gmail.com>
	Sat, 12 Nov 2016 05:05:27 +0000 (05:05 +0000)
lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v4.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-256-v8.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-512-v16.ll		patch \| blob \| history
test/CodeGen/X86/vector-shuffle-512-v8.ll		patch \| blob \| history