const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
- if (Subtarget.hasAVX2() && V2.isUndef())
- return SDValue();
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
}
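
The memory-folding rationale in the moved comment is visible at the instruction level: VPERMPD/VPERMQ take a single source, so a 256-bit load of that source folds directly into the shuffle, whereas the two-source VPERM2F128 form of a unary shuffle uses the same value for both operands and therefore needs it in a register first. A minimal sketch with AVX2 intrinsics (the helper name swap_high_lane is illustrative, not part of the patch):

  #include <immintrin.h>

  // Unary <2,3,2,3> shuffle: replicate the upper 128-bit lane.
  // With AVX2 this can compile to a single vpermpd with a folded
  // 256-bit memory operand; the AVX1 vperm2f128 lowering must load
  // the value into a register before shuffling it against itself.
  __m256d swap_high_lane(const __m256d *p) {
    return _mm256_permute4x64_pd(*p, _MM_SHUFFLE(3, 2, 3, 2)); // imm = 0xEE
  }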
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu67uu67:
-; ALL: # BB#0: # %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu67uu67:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu67uu67:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x float> %shuffle
}
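
canWidenShuffleElements is what maps the v8f32 mask above onto 128-bit halves: <u,u,6,7,u,u,6,7> widens to the v4 mask <u,3,u,3>, which a single VPERMPD immediate can express; the undef lanes are then free to take any value in the immediate, which is why the checks show ymm0[0,3,2,3]. A standalone sketch of the widening rule (widenMask is a hypothetical simplification; the real helper also handles zeroable elements and sentinel encodings):

  #include <array>
  #include <optional>

  // Widen a v8 shuffle mask to v4 by fusing adjacent pairs. A pair is
  // widenable when it reads one 64-bit element in order (2k, 2k+1);
  // undef entries (-1) act as wildcards.
  std::optional<std::array<int, 4>> widenMask(const std::array<int, 8> &M) {
    std::array<int, 4> W;
    for (int i = 0; i != 4; ++i) {
      int Lo = M[2 * i], Hi = M[2 * i + 1];
      if (Lo < 0 && Hi < 0) { W[i] = -1; continue; } // fully undef pair
      if (Lo >= 0 && (Lo & 1)) return std::nullopt;  // low half must be even
      if (Hi >= 0 && !(Hi & 1)) return std::nullopt; // high half must be odd
      if (Lo >= 0 && Hi >= 0 && Hi != Lo + 1)
        return std::nullopt;                         // halves from one element
      W[i] = (Lo >= 0 ? Lo : Hi) / 2;
    }
    return W; // e.g. {-1,-1,6,7,-1,-1,6,7} -> {-1,3,-1,3}
  }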
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu674567:
-; ALL: # BB#0: # %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu674567:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu674567:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
}
; CHECK: # BB#0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
ret <4 x i64> %res
}
; CHECK: # BB#0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
ret <4 x i64> %res
}
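
The AVX512VL diffs above show the second payoff: VPERMQ has EVEX merge-masked and zero-masked forms while VPERM2I128 does not, so the separate vpblendmq/vmovdqa64 blend folds into the permute itself. The same shapes in intrinsics (helper names are illustrative; the intrinsics are the standard AVX512F+VL ones):

  #include <immintrin.h>

  // Merge-masked unary permute: lanes selected by k take the permuted
  // value of a, the rest pass through src -- one vpermq, no blend.
  __m256i masked_lane_perm(__m256i src, __mmask8 k, __m256i a) {
    return _mm256_mask_permutex_epi64(src, k, a, _MM_SHUFFLE(3, 2, 3, 2));
  }

  // Zero-masked variant: unselected lanes are zeroed instead.
  __m256i maskz_lane_perm(__mmask8 k, __m256i a) {
    return _mm256_maskz_permutex_epi64(k, a, _MM_SHUFFLE(3, 2, 3, 2));
  }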
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
ret <4 x double> %res
}
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
ret <4 x double> %res
}
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
ret <4 x double> %res
}
define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76547654:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_76547654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8f32_76547654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76543210:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_76543210:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8f32_76543210:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
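
The two-instruction sequence in these checks is an in-lane reverse followed by a cross-lane swap. An intrinsics sketch of the AVX512VL output (it compiles with plain AVX2; reverse8 is illustrative, not from the test file):

  #include <immintrin.h>

  // Reverse all eight floats: vpermilps reverses within each 128-bit
  // lane, then an immediate vpermpd swaps the two lanes.
  __m256 reverse8(__m256 v) {
    v = _mm256_permute_ps(v, _MM_SHUFFLE(0, 1, 2, 3));   // [3,2,1,0] per lane
    return _mm256_castpd_ps(_mm256_permute4x64_pd(
        _mm256_castps_pd(v), _MM_SHUFFLE(1, 0, 3, 2)));  // lanes [2,3,0,1]
  }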
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76547654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_76547654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76543210:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_76543210:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}